aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/exec.c2
-rw-r--r--fs/fuse/dev.c72
-rw-r--r--fs/fuse/dir.c278
-rw-r--r--fs/fuse/file.c49
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c14
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/jbd/checkpoint.c418
-rw-r--r--fs/nfsd/nfs3proc.c11
-rw-r--r--fs/nfsd/nfs3xdr.c47
-rw-r--r--fs/nfsd/nfsxdr.c48
-rw-r--r--fs/nfsd/vfs.c40
-rw-r--r--fs/partitions/Kconfig2
-rw-r--r--fs/partitions/ibm.c30
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/ramfs/Makefile4
-rw-r--r--fs/ramfs/file-mmu.c57
-rw-r--r--fs/ramfs/file-nommu.c292
-rw-r--r--fs/ramfs/inode.c22
-rw-r--r--fs/ramfs/internal.h15
21 files changed, 987 insertions, 433 deletions
diff --git a/fs/exec.c b/fs/exec.c
index 22533cce061..e75a9548da8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
lru_cache_add_active(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
- page_add_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address);
pte_unmap_unlock(pte, ptl);
/* no need for flush_tlb */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e621f4..e08ab4702d9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req)
spin_unlock(&fuse_lock);
}
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ int i;
+ struct fuse_init_out *arg = &req->misc.init_out;
+
+ if (arg->major != FUSE_KERNEL_VERSION)
+ fc->conn_error = 1;
+ else {
+ fc->minor = arg->minor;
+ fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+ }
+
+ /* After INIT reply is received other requests can go
+ out. So do (FUSE_MAX_OUTSTANDING - 1) number of
+ up()s on outstanding_sem. The last up() is done in
+ fuse_putback_request() */
+ for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+ up(&fc->outstanding_sem);
+}
+
/*
* This function is called when a request is finished. Either a reply
* has arrived or it was interrupted (and not yet sent) or some error
@@ -172,19 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
up_read(&fc->sbput_sem);
}
wake_up(&req->waitq);
- if (req->in.h.opcode == FUSE_INIT) {
- int i;
-
- if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
- fc->conn_error = 1;
-
- /* After INIT reply is received other requests can go
- out. So do (FUSE_MAX_OUTSTANDING - 1) number of
- up()s on outstanding_sem. The last up() is done in
- fuse_putback_request() */
- for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
- up(&fc->outstanding_sem);
- } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
+ if (req->in.h.opcode == FUSE_INIT)
+ process_init_reply(fc, req);
+ else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
/* Special case for failed iget in CREATE */
u64 nodeid = req->in.h.nodeid;
__fuse_get_request(req);
@@ -357,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc)
/* This is called from fuse_read_super() so there's guaranteed
to be a request available */
struct fuse_req *req = do_get_request(fc);
- struct fuse_init_in_out *arg = &req->misc.init_in_out;
+ struct fuse_init_in *arg = &req->misc.init_in;
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
req->in.h.opcode = FUSE_INIT;
@@ -365,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc)
req->in.args[0].size = sizeof(*arg);
req->in.args[0].value = arg;
req->out.numargs = 1;
- req->out.args[0].size = sizeof(*arg);
- req->out.args[0].value = arg;
+ /* Variable length arguement used for backward compatibility
+ with interface version < 7.5. Rest of init_out is zeroed
+ by do_get_request(), so a short reply is not a problem */
+ req->out.argvar = 1;
+ req->out.args[0].size = sizeof(struct fuse_init_out);
+ req->out.args[0].value = &req->misc.init_out;
request_send_background(fc, req);
}
@@ -615,6 +629,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
struct fuse_copy_state cs;
unsigned reqsize;
+ restart:
spin_lock(&fuse_lock);
fc = file->private_data;
err = -EPERM;
@@ -630,20 +645,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
req = list_entry(fc->pending.next, struct fuse_req, list);
list_del_init(&req->list);
- spin_unlock(&fuse_lock);
in = &req->in;
- reqsize = req->in.h.len;
- fuse_copy_init(&cs, 1, req, iov, nr_segs);
- err = -EINVAL;
- if (iov_length(iov, nr_segs) >= reqsize) {
- err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
- if (!err)
- err = fuse_copy_args(&cs, in->numargs, in->argpages,
- (struct fuse_arg *) in->args, 0);
+ reqsize = in->h.len;
+ /* If request is too large, reply with an error and restart the read */
+ if (iov_length(iov, nr_segs) < reqsize) {
+ req->out.h.error = -EIO;
+ /* SETXATTR is special, since it may contain too large data */
+ if (in->h.opcode == FUSE_SETXATTR)
+ req->out.h.error = -E2BIG;
+ request_end(fc, req);
+ goto restart;
}
+ spin_unlock(&fuse_lock);
+ fuse_copy_init(&cs, 1, req, iov, nr_segs);
+ err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+ if (!err)
+ err = fuse_copy_args(&cs, in->numargs, in->argpages,
+ (struct fuse_arg *) in->args, 0);
fuse_copy_finish(&cs);
-
spin_lock(&fuse_lock);
req->locked = 0;
if (!err && req->interrupted)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 51f5da65277..417bcee466f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,8 +13,16 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/namei.h>
-#include <linux/mount.h>
+/*
+ * FUSE caches dentries and attributes with separate timeout. The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_time and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
static inline unsigned long time_to_jiffies(unsigned long sec,
unsigned long nsec)
{
@@ -22,6 +30,50 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
return jiffies + timespec_to_jiffies(&ts);
}
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
+static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
+{
+ entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
+ if (entry->d_inode)
+ get_fuse_inode(entry->d_inode)->i_time =
+ time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
+void fuse_invalidate_attr(struct inode *inode)
+{
+ get_fuse_inode(inode)->i_time = jiffies - 1;
+}
+
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
+static void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+ entry->d_time = jiffies - 1;
+}
+
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
+}
+
static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
struct dentry *entry,
struct fuse_entry_out *outarg)
@@ -37,17 +89,34 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
req->out.args[0].value = outarg;
}
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup. If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more. If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
{
- if (!entry->d_inode || is_bad_inode(entry->d_inode))
+ struct inode *inode = entry->d_inode;
+
+ if (inode && is_bad_inode(inode))
return 0;
else if (time_after(jiffies, entry->d_time)) {
int err;
struct fuse_entry_out outarg;
- struct inode *inode = entry->d_inode;
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_request(fc);
+ struct fuse_conn *fc;
+ struct fuse_req *req;
+
+ /* Doesn't hurt to "reset" the validity timeout */
+ fuse_invalidate_entry_cache(entry);
+ if (!inode)
+ return 0;
+
+ fc = get_fuse_conn(inode);
+ req = fuse_get_request(fc);
if (!req)
return 0;
@@ -55,6 +124,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
request_send(fc, req);
err = req->out.h.error;
if (!err) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
if (outarg.nodeid != get_node_id(inode)) {
fuse_send_forget(fc, req, outarg.nodeid, 1);
return 0;
@@ -66,18 +136,18 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
return 0;
fuse_change_attributes(inode, &outarg.attr);
- entry->d_time = time_to_jiffies(outarg.entry_valid,
- outarg.entry_valid_nsec);
- fi->i_time = time_to_jiffies(outarg.attr_valid,
- outarg.attr_valid_nsec);
+ fuse_change_timeout(entry, &outarg);
}
return 1;
}
+/*
+ * Check if there's already a hashed alias of this directory inode.
+ * If yes, then lookup and mkdir must not create a new alias.
+ */
static int dir_alias(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
- /* Don't allow creating an alias to a directory */
struct dentry *alias = d_find_alias(inode);
if (alias) {
dput(alias);
@@ -96,8 +166,14 @@ static struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
};
-static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
- struct inode **inodep)
+static inline int valid_mode(int m)
+{
+ return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+ S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+ struct nameidata *nd)
{
int err;
struct fuse_entry_out outarg;
@@ -106,53 +182,49 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
struct fuse_req *req;
if (entry->d_name.len > FUSE_NAME_MAX)
- return -ENAMETOOLONG;
+ return ERR_PTR(-ENAMETOOLONG);
req = fuse_get_request(fc);
if (!req)
- return -EINTR;
+ return ERR_PTR(-EINTR);
fuse_lookup_init(req, dir, entry, &outarg);
request_send(fc, req);
err = req->out.h.error;
- if (!err && invalid_nodeid(outarg.nodeid))
+ if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
+ !valid_mode(outarg.attr.mode)))
err = -EIO;
- if (!err) {
+ if (!err && outarg.nodeid) {
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr);
if (!inode) {
fuse_send_forget(fc, req, outarg.nodeid, 1);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
}
fuse_put_request(fc, req);
if (err && err != -ENOENT)
- return err;
+ return ERR_PTR(err);
- if (inode) {
- struct fuse_inode *fi = get_fuse_inode(inode);
- entry->d_time = time_to_jiffies(outarg.entry_valid,
- outarg.entry_valid_nsec);
- fi->i_time = time_to_jiffies(outarg.attr_valid,
- outarg.attr_valid_nsec);
+ if (inode && dir_alias(inode)) {
+ iput(inode);
+ return ERR_PTR(-EIO);
}
-
+ d_add(entry, inode);
entry->d_op = &fuse_dentry_operations;
- *inodep = inode;
- return 0;
-}
-
-void fuse_invalidate_attr(struct inode *inode)
-{
- get_fuse_inode(inode)->i_time = jiffies - 1;
-}
-
-static void fuse_invalidate_entry(struct dentry *entry)
-{
- d_invalidate(entry);
- entry->d_time = jiffies - 1;
+ if (!err)
+ fuse_change_timeout(entry, &outarg);
+ else
+ fuse_invalidate_entry_cache(entry);
+ return NULL;
}
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
struct nameidata *nd)
{
@@ -163,7 +235,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
struct fuse_open_in inarg;
struct fuse_open_out outopen;
struct fuse_entry_out outentry;
- struct fuse_inode *fi;
struct fuse_file *ff;
struct file *file;
int flags = nd->intent.open.flags - 1;
@@ -172,10 +243,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
if (fc->no_create)
goto out;
- err = -ENAMETOOLONG;
- if (entry->d_name.len > FUSE_NAME_MAX)
- goto out;
-
err = -EINTR;
req = fuse_get_request(fc);
if (!req)
@@ -220,17 +287,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
ff->fh = outopen.fh;
+ /* Special release, with inode = NULL, this will
+ trigger a 'forget' request when the release is
+ complete */
fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
goto out_put_request;
}
fuse_put_request(fc, req);
- entry->d_time = time_to_jiffies(outentry.entry_valid,
- outentry.entry_valid_nsec);
- fi = get_fuse_inode(inode);
- fi->i_time = time_to_jiffies(outentry.attr_valid,
- outentry.attr_valid_nsec);
-
d_instantiate(entry, inode);
+ fuse_change_timeout(entry, &outentry);
file = lookup_instantiate_filp(nd, entry, generic_file_open);
if (IS_ERR(file)) {
ff->fh = outopen.fh;
@@ -248,13 +313,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
return err;
}
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
struct inode *dir, struct dentry *entry,
int mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
- struct fuse_inode *fi;
int err;
req->in.h.nodeid = get_node_id(dir);
@@ -268,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
fuse_put_request(fc, req);
return err;
}
- if (invalid_nodeid(outarg.nodeid)) {
- fuse_put_request(fc, req);
- return -EIO;
- }
+ err = -EIO;
+ if (invalid_nodeid(outarg.nodeid))
+ goto out_put_request;
+
+ if ((outarg.attr.mode ^ mode) & S_IFMT)
+ goto out_put_request;
+
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
&outarg.attr);
if (!inode) {
@@ -280,22 +350,19 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
}
fuse_put_request(fc, req);
- /* Don't allow userspace to do really stupid things... */
- if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) {
+ if (dir_alias(inode)) {
iput(inode);
return -EIO;
}
- entry->d_time = time_to_jiffies(outarg.entry_valid,
- outarg.entry_valid_nsec);
-
- fi = get_fuse_inode(inode);
- fi->i_time = time_to_jiffies(outarg.attr_valid,
- outarg.attr_valid_nsec);
-
d_instantiate(entry, inode);
+ fuse_change_timeout(entry, &outarg);
fuse_invalidate_attr(dir);
return 0;
+
+ out_put_request:
+ fuse_put_request(fc, req);
+ return err;
}
static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
@@ -355,12 +422,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
{
struct fuse_conn *fc = get_fuse_conn(dir);
unsigned len = strlen(link) + 1;
- struct fuse_req *req;
-
- if (len > FUSE_SYMLINK_MAX)
- return -ENAMETOOLONG;
-
- req = fuse_get_request(fc);
+ struct fuse_req *req = fuse_get_request(fc);
if (!req)
return -EINTR;
@@ -399,6 +461,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
inode->i_nlink = 0;
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
+ fuse_invalidate_entry_cache(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -424,6 +487,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (!err) {
entry->d_inode->i_nlink = 0;
fuse_invalidate_attr(dir);
+ fuse_invalidate_entry_cache(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -459,6 +523,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
fuse_invalidate_attr(olddir);
if (olddir != newdir)
fuse_invalidate_attr(newdir);
+
+ /* newent will end up negative */
+ if (newent->d_inode)
+ fuse_invalidate_entry_cache(newent);
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
@@ -566,6 +634,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
return 0;
}
+/*
+ * Check whether the inode attributes are still valid
+ *
+ * If the attribute validity timeout has expired, then fetch the fresh
+ * attributes with a 'getattr' request
+ *
+ * I'm not sure why cached attributes are never returned for the root
+ * inode, this is probably being too cautious.
+ */
static int fuse_revalidate(struct dentry *entry)
{
struct inode *inode = entry->d_inode;
@@ -613,6 +690,19 @@ static int fuse_access(struct inode *inode, int mask)
return err;
}
+/*
+ * Check permission. The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode. This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation. An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent. Execute permission is still checked
+ * locally based on file mode.
+ */
static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
{
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -631,14 +721,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
err = generic_permission(inode, mask, NULL);
}
- /* FIXME: Need some mechanism to revoke permissions:
- currently if the filesystem suddenly changes the
- file mode, we will not be informed about it, and
- continue to allow access to the file/directory.
-
- This is actually not so grave, since the user can
- simply keep access to the file/directory anyway by
- keeping it open... */
+ /* Note: the opposite of the above test does not
+ exist. So if permissions are revoked this won't be
+ noticed immediately, only after the attribute
+ timeout has expired */
return err;
} else {
@@ -691,7 +777,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
struct page *page;
struct inode *inode = file->f_dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_request(fc);
+ struct fuse_req *req;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ req = fuse_get_request(fc);
if (!req)
return -EINTR;
@@ -806,6 +897,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
}
}
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case. So do the rlimit
+ * checking by hand and call vmtruncate() only after the file has
+ * actually been truncated.
+ */
static int fuse_setattr(struct dentry *entry, struct iattr *attr)
{
struct inode *inode = entry->d_inode;
@@ -883,23 +983,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
return err;
}
-static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
- struct nameidata *nd)
-{
- struct inode *inode;
- int err;
-
- err = fuse_lookup_iget(dir, entry, &inode);
- if (err)
- return ERR_PTR(err);
- if (inode && dir_alias(inode)) {
- iput(inode);
- return ERR_PTR(-EIO);
- }
- d_add(entry, inode);
- return NULL;
-}
-
static int fuse_setxattr(struct dentry *entry, const char *name,
const void *value, size_t size, int flags)
{
@@ -909,9 +992,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
struct fuse_setxattr_in inarg;
int err;
- if (size > FUSE_XATTR_SIZE_MAX)
- return -E2BIG;
-
if (fc->no_setxattr)
return -EOPNOTSUPP;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2ca86141d13..05dedddf428 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -163,6 +163,9 @@ static int fuse_flush(struct file *file)
struct fuse_flush_in inarg;
int err;
+ if (is_bad_inode(inode))
+ return -EIO;
+
if (fc->no_flush)
return 0;
@@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
struct fuse_fsync_in inarg;
int err;
+ if (is_bad_inode(inode))
+ return -EIO;
+
if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
return 0;
@@ -272,16 +278,22 @@ static int fuse_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
- struct fuse_req *req = fuse_get_request(fc);
- int err = -EINTR;
+ struct fuse_req *req;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ err = -EINTR;
+ req = fuse_get_request(fc);
if (!req)
goto out;
req->out.page_zeroing = 1;
req->num_pages = 1;
req->pages[0] = page;
- fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE);
+ fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err)
@@ -295,7 +307,7 @@ static int fuse_readpage(struct file *file, struct page *page)
static int fuse_send_readpages(struct fuse_req *req, struct file *file,
struct inode *inode)
{
- loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+ loff_t pos = page_offset(req->pages[0]);
size_t count = req->num_pages << PAGE_CACHE_SHIFT;
unsigned i;
req->out.page_zeroing = 1;
@@ -345,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_readpages_data data;
int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
data.file = file;
data.inode = inode;
data.req = fuse_get_request(fc);
@@ -402,8 +418,13 @@ static int fuse_commit_write(struct file *file, struct page *page,
unsigned count = to - offset;
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
- struct fuse_req *req = fuse_get_request(fc);
+ loff_t pos = page_offset(page) + offset;
+ struct fuse_req *req;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ req = fuse_get_request(fc);
if (!req)
return -EINTR;
@@ -454,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- npages = min(npages, FUSE_MAX_PAGES_PER_REQ);
+ npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ);
down_read(&current->mm->mmap_sem);
npages = get_user_pages(current, current->mm, user_addr, npages, write,
0, req->pages, NULL);
@@ -475,12 +496,16 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
ssize_t res = 0;
- struct fuse_req *req = fuse_get_request(fc);
+ struct fuse_req *req;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ req = fuse_get_request(fc);
if (!req)
return -EINTR;
while (count) {
- size_t tmp;
size_t nres;
size_t nbytes = min(count, nmax);
int err = fuse_get_user_pages(req, buf, nbytes, !write);
@@ -488,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
res = err;
break;
}
- tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
- nbytes = min(nbytes, tmp);
+ nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+ nbytes = min(count, nbytes);
if (write)
nres = fuse_send_write(req, file, inode, pos, nbytes);
else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301f86b..74c8d098a14 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,9 @@
/** If more requests are outstanding, then the operation will block */
#define FUSE_MAX_OUTSTANDING 10
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
module will check permissions based on the file mode. Otherwise no
permission checking is done in the kernel */
@@ -108,9 +111,6 @@ struct fuse_out {
struct fuse_arg args[3];
};
-struct fuse_req;
-struct fuse_conn;
-
/**
* A request to the client
*/
@@ -159,7 +159,8 @@ struct fuse_req {
union {
struct fuse_forget_in forget_in;
struct fuse_release_in release_in;
- struct fuse_init_in_out init_in_out;
+ struct fuse_init_in init_in;
+ struct fuse_init_out init_out;
} misc;
/** page vector */
@@ -272,6 +273,9 @@ struct fuse_conn {
/** Is create not implemented by fs? */
unsigned no_create : 1;
+ /** Negotiated minor version */
+ unsigned minor;
+
/** Backing dev info */
struct backing_dev_info bdi;
};
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546844d..04c80cc957a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
fuse_init_common(inode);
init_special_inode(inode, inode->i_mode,
new_decode_dev(attr->rdev));
- } else {
- /* Don't let user create weird files */
- inode->i_mode = S_IFREG;
- fuse_init_common(inode);
- fuse_init_file_inode(inode);
- }
+ } else
+ BUG();
}
static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -218,6 +214,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
{
stbuf->f_type = FUSE_SUPER_MAGIC;
stbuf->f_bsize = attr->bsize;
+ stbuf->f_frsize = attr->frsize;
stbuf->f_blocks = attr->blocks;
stbuf->f_bfree = attr->bfree;
stbuf->f_bavail = attr->bavail;
@@ -238,10 +235,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
if (!req)
return -EINTR;
+ memset(&outarg, 0, sizeof(outarg));
req->in.numargs = 0;
req->in.h.opcode = FUSE_STATFS;
req->out.numargs = 1;
- req->out.args[0].size = sizeof(outarg);
+ req->out.args[0].size =
+ fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
req->out.args[0].value = &outarg;
request_send(fc, req);
err = req->out.h.error;
@@ -482,7 +481,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->max_read = d.max_read;
if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
- fc->max_write = FUSE_MAX_IN / 2;
err = -ENOMEM;
root = get_root_inode(sb, d.rootmode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 8093351bd7c..6daaf7c755a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -320,7 +320,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
/* temporarily use utf8 to correctly find the hidden dir below */
nls = sbi->nls;
sbi->nls = load_nls("utf8");
- if (!nls) {
+ if (!sbi->nls) {
printk("HFS+: unable to load nls for utf8\n");
err = -EINVAL;
goto cleanup;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3bb67..8c41315a6e4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
loff_t len, vma_len;
int ret;
- if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
- return -EINVAL;
-
if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
return -EINVAL;
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51fd00d..cb3cef525c3 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
#include <linux/slab.h>
/*
- * Unlink a buffer from a transaction.
+ * Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
-static inline void __buffer_unlink(struct journal_head *jh)
+static void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction;
transaction = jh->b_cp_transaction;
- jh->b_cp_transaction = NULL;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
- if (transaction->t_checkpoint_list == jh)
+ if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
- if (transaction->t_checkpoint_list == jh)
- transaction->t_checkpoint_list = NULL;
+ if (transaction->t_checkpoint_list == jh)
+ transaction->t_checkpoint_list = NULL;
+ }
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+ transaction_t *transaction;
+
+ transaction = jh->b_cp_transaction;
+
+ __buffer_unlink_first(jh);
+ if (transaction->t_checkpoint_io_list == jh) {
+ transaction->t_checkpoint_io_list = jh->b_cpnext;
+ if (transaction->t_checkpoint_io_list == jh)
+ transaction->t_checkpoint_io_list = NULL;
+ }
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+ transaction_t *transaction;
+
+ transaction = jh->b_cp_transaction;
+ __buffer_unlink_first(jh);
+
+ if (!transaction->t_checkpoint_io_list) {
+ jh->b_cpnext = jh->b_cpprev = jh;
+ } else {
+ jh->b_cpnext = transaction->t_checkpoint_io_list;
+ jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+ jh->b_cpprev->b_cpnext = jh;
+ jh->b_cpnext->b_cpprev = jh;
+ }
+ transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
* Requires j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
- __journal_remove_checkpoint(jh);
+ ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
- ret = 1;
} else {
jbd_unlock_bh_state(bh);
}
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
}
/*
- * Clean up a transaction's checkpoint list.
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction.
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint. (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
*
* Called with j_list_lock held.
*/
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
- struct journal_head *jh, *next_jh, *last_jh;
+ struct journal_head *jh;
struct buffer_head *bh;
- int ret = 0;
-
- assert_spin_locked(&journal->j_list_lock);
- jh = transaction->t_checkpoint_list;
- if (!jh)
- return 0;
-
- last_jh = jh->b_cpprev;
- next_jh = jh;
- do {
- jh = next_jh;
+ tid_t this_tid;
+ int released = 0;
+
+ this_tid = transaction->t_tid;
+restart:
+ /* Didn't somebody clean up the transaction in the meanwhile */
+ if (journal->j_checkpoint_transactions != transaction ||
+ transaction->t_tid != this_tid)
+ return;
+ while (!released && transaction->t_checkpoint_io_list) {
+ jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
+ if (!jbd_trylock_bh_state(bh)) {
+ jbd_sync_bh(journal, bh);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
+ }
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
- goto out_return_1;
- }
-
- /*
- * This is foul
- */
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- goto out_return_1;
+ spin_lock(&journal->j_list_lock);
+ goto restart;
}
-
- if (jh->b_transaction != NULL) {
- transaction_t *t = jh->b_transaction;
- tid_t tid = t->t_tid;
-
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- log_start_commit(journal, tid);
- log_wait_commit(journal, tid);
- goto out_return_1;
- }
-
/*
- * AKPM: I think the buffer_jbddirty test is redundant - it
- * shouldn't have NULL b_transaction?
+ * Now in whatever state the buffer currently is, we know that
+ * it has been written out and so we can drop it from the list
*/
- next_jh = jh->b_cpnext;
- if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
- BUFFER_TRACE(bh, "remove from checkpoint");
- __journal_remove_checkpoint(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- __brelse(bh);
- ret = 1;
- } else {
- jbd_unlock_bh_state(bh);
- }
- } while (jh != last_jh);
-
- return ret;
-out_return_1:
- spin_lock(&journal->j_list_lock);
- return 1;
+ released = __journal_remove_checkpoint(jh);
+ jbd_unlock_bh_state(bh);
+ }
}
#define NR_BATCH 64
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
- spin_unlock(&journal->j_list_lock);
ll_rw_block(SWRITE, *batch_count, bhs);
- spin_lock(&journal->j_list_lock);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
* Return 1 if something happened which requires us to abort the current
* scan of the checkpoint list.
*
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
- struct buffer_head **bhs, int *batch_count,
- int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+ struct buffer_head **bhs, int *batch_count)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
- if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
- J_ASSERT_JH(jh, jh->b_transaction == NULL);
+ if (buffer_locked(bh)) {
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ put_bh(bh);
+ ret = 1;
+ }
+ else if (jh->b_transaction != NULL) {
+ transaction_t *t = jh->b_transaction;
+ tid_t tid = t->t_tid;
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ log_start_commit(journal, tid);
+ log_wait_commit(journal, tid);
+ ret = 1;
+ }
+ else if (!buffer_dirty(bh)) {
+ J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+ BUFFER_TRACE(bh, "remove from checkpoint");
+ __journal_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ journal_remove_journal_head(bh);
+ put_bh(bh);
+ ret = 1;
+ }
+ else {
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
bhs[*batch_count] = bh;
+ __buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
(*batch_count)++;
if (*batch_count == NR_BATCH) {
+ spin_unlock(&journal->j_list_lock);
__flush_batch(journal, bhs, batch_count);
ret = 1;
}
- } else {
- int last_buffer = 0;
- if (jh->b_cpnext == jh) {
- /* We may be about to drop the transaction. Tell the
- * caller that the lists have changed.
- */
- last_buffer = 1;
- }
- if (__try_to_free_cp_buf(jh)) {
- (*drop_count)++;
- ret = last_buffer;
- }
}
return ret;
}
/*
- * Perform an actual checkpoint. We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency. __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
*
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.
- *
* The journal should be locked before calling this function.
*/
int log_do_checkpoint(journal_t *journal)
{
+ transaction_t *transaction;
+ tid_t this_tid;
int result;
- int batch_count = 0;
- struct buffer_head *bhs[NR_BATCH];
jbd_debug(1, "Start checkpoint\n");
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
return result;
/*
- * OK, we need to start writing disk blocks. Try to free up a
- * quarter of the log in a single checkpoint if we can.
+ * OK, we need to start writing disk blocks. Take one transaction
+ * and write it.
*/
+ spin_lock(&journal->j_list_lock);
+ if (!journal->j_checkpoint_transactions)
+ goto out;
+ transaction = journal->j_checkpoint_transactions;
+ this_tid = transaction->t_tid;
+restart:
/*
- * AKPM: check this code. I had a feeling a while back that it
- * degenerates into a busy loop at unmount time.
+ * If someone cleaned up this transaction while we slept, we're
+ * done (maybe it's a new transaction, but it fell at the same
+ * address).
*/
- spin_lock(&journal->j_list_lock);
- while (journal->j_checkpoint_transactions) {
- transaction_t *transaction;
- struct journal_head *jh, *last_jh, *next_jh;
- int drop_count = 0;
- int cleanup_ret, retry = 0;
- tid_t this_tid;
-
- transaction = journal->j_checkpoint_transactions;
- this_tid = transaction->t_tid;
- jh = transaction->t_checkpoint_list;
- last_jh = jh->b_cpprev;
- next_jh = jh;
- do {
+ if (journal->j_checkpoint_transactions == transaction ||
+ transaction->t_tid == this_tid) {
+ int batch_count = 0;
+ struct buffer_head *bhs[NR_BATCH];
+ struct journal_head *jh;
+ int retry = 0;
+
+ while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
- jh = next_jh;
- next_jh = jh->b_cpnext;
+ jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
- spin_lock(&journal->j_list_lock);
retry = 1;
break;
}
- retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
- if (cond_resched_lock(&journal->j_list_lock)) {
+ retry = __process_buffer(journal, jh, bhs,
+ &batch_count);
+ if (!retry &&
+ lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
- } while (jh != last_jh && !retry);
+ }
if (batch_count) {
+ if (!retry) {
+ spin_unlock(&journal->j_list_lock);
+ retry = 1;
+ }
__flush_batch(journal, bhs, &batch_count);
- retry = 1;
}
+ if (retry) {
+ spin_lock(&journal->j_list_lock);
+ goto restart;
+ }
/*
- * If someone cleaned up this transaction while we slept, we're
- * done
- */
- if (journal->j_checkpoint_transactions != transaction)
- break;
- if (retry)
- continue;
- /*
- * Maybe it's a new transaction, but it fell at the same
- * address
- */
- if (transaction->t_tid != this_tid)
- continue;
- /*
- * We have walked the whole transaction list without
- * finding anything to write to disk. We had better be
- * able to make some progress or we are in trouble.
+ * Now we have cleaned up the first transaction's checkpoint
+ * list. Let's clean up the second one.
*/
- cleanup_ret = __cleanup_transaction(journal, transaction);
- J_ASSERT(drop_count != 0 || cleanup_ret != 0);
- if (journal->j_checkpoint_transactions != transaction)
- break;
+ __wait_cp_io(journal, transaction);
}
+out:
spin_unlock(&journal->j_list_lock);
result = cleanup_journal_tail(journal);
if (result < 0)
return result;
-
return 0;
}
@@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+ struct journal_head *last_jh;
+ struct journal_head *next_jh = jh;
+ int ret, freed = 0;
+
+ *released = 0;
+ if (!jh)
+ return 0;
+
+ last_jh = jh->b_cpprev;
+ do {
+ jh = next_jh;
+ next_jh = jh->b_cpnext;
+ /* Use trylock because of the ranking */
+ if (jbd_trylock_bh_state(jh2bh(jh))) {
+ ret = __try_to_free_cp_buf(jh);
+ if (ret) {
+ freed++;
+ if (ret == 2) {
+ *released = 1;
+ return freed;
+ }
+ }
+ }
+ /*
+ * This function only frees up some memory if possible so we
+ * dont have an obligation to finish processing. Bail out if
+ * preemption requested:
+ */
+ if (need_resched())
+ return freed;
+ } while (jh != last_jh);
+
+ return freed;
+}
+
+/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
*/
int __journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
- int ret = 0;
+ int ret = 0, released;
transaction = journal->j_checkpoint_transactions;
- if (transaction == 0)
+ if (!transaction)
goto out;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
- struct journal_head *jh;
-
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- jh = transaction->t_checkpoint_list;
- if (jh) {
- struct journal_head *last_jh = jh->b_cpprev;
- struct journal_head *next_jh = jh;
-
- do {
- jh = next_jh;
- next_jh = jh->b_cpnext;
- /* Use trylock because of the ranknig */
- if (jbd_trylock_bh_state(jh2bh(jh)))
- ret += __try_to_free_cp_buf(jh);
- /*
- * This function only frees up some memory
- * if possible so we dont have an obligation
- * to finish processing. Bail out if preemption
- * requested:
- */
- if (need_resched())
- goto out;
- } while (jh != last_jh);
- }
+ ret += journal_clean_one_cp_list(transaction->
+ t_checkpoint_list, &released);
+ if (need_resched())
+ goto out;
+ if (released)
+ continue;
+ /*
+ * It is essential that we are as careful as in the case of
+ * t_checkpoint_list with removing the buffer from the list as
+ * we can possibly see not yet submitted buffers on io_list
+ */
+ ret += journal_clean_one_cp_list(transaction->
+ t_checkpoint_io_list, &released);
+ if (need_resched())
+ goto out;
} while (transaction != last_transaction);
out:
return ret;
@@ -516,18 +571,22 @@ out:
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
- * checkpoint list.
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
*
* This function is called with the journal locked.
* This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
{
transaction_t *transaction;
journal_t *journal;
+ int ret = 0;
JBUFFER_TRACE(jh, "entry");
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
journal = transaction->t_journal;
__buffer_unlink(jh);
+ jh->b_cp_transaction = NULL;
- if (transaction->t_checkpoint_list != NULL)
+ if (transaction->t_checkpoint_list != NULL ||
+ transaction->t_checkpoint_io_list != NULL)
goto out;
JBUFFER_TRACE(jh, "transaction has no more buffers");
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
wake_up(&journal->j_wait_logspace);
+ ret = 1;
out:
JBUFFER_TRACE(jh, "exit");
+ return ret;
}
/*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
+ J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(transaction->t_updates == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380fe667..6d2dfed1de0 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
struct nfsd3_attrstat *resp)
{
- int nfserr;
+ int err, nfserr;
dprintk("nfsd: GETATTR(3) %s\n",
- SVCFH_fmt(&argp->fh));
+ SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+ if (nfserr)
+ RETURN_STATUS(nfserr);
+
+ err = vfs_getattr(resp->fh.fh_export->ex_mnt,
+ resp->fh.fh_dentry, &resp->stat);
+ nfserr = nfserrno(err);
+
RETURN_STATUS(nfserr);
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b8524d0..243d94b9653 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
}
static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+ struct kstat *stat)
{
- struct vfsmount *mnt = fhp->fh_export->ex_mnt;
struct dentry *dentry = fhp->fh_dentry;
- struct kstat stat;
struct timespec time;
- vfs_getattr(mnt, dentry, &stat);
-
- *p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]);
- *p++ = htonl((u32) stat.mode);
- *p++ = htonl((u32) stat.nlink);
- *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
- *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
- if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
+ *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+ *p++ = htonl((u32) stat->mode);
+ *p++ = htonl((u32) stat->nlink);
+ *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+ *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+ if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
} else {
- p = xdr_encode_hyper(p, (u64) stat.size);
+ p = xdr_encode_hyper(p, (u64) stat->size);
}
- p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9);
- *p++ = htonl((u32) MAJOR(stat.rdev));
- *p++ = htonl((u32) MINOR(stat.rdev));
+ p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+ *p++ = htonl((u32) MAJOR(stat->rdev));
+ *p++ = htonl((u32) MINOR(stat->rdev));
if (is_fsid(fhp, rqstp->rq_reffh))
p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
else
- p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev));
- p = xdr_encode_hyper(p, (u64) stat.ino);
- p = encode_time3(p, &stat.atime);
+ p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
+ p = xdr_encode_hyper(p, (u64) stat->ino);
+ p = encode_time3(p, &stat->atime);
lease_get_mtime(dentry->d_inode, &time);
p = encode_time3(p, &time);
- p = encode_time3(p, &stat.ctime);
+ p = encode_time3(p, &stat->ctime);
return p;
}
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
if (dentry && dentry->d_inode != NULL) {
- *p++ = xdr_one; /* attributes follow */
- return encode_fattr3(rqstp, p, fhp);
+ int err;
+ struct kstat stat;
+
+ err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
+ if (!err) {
+ *p++ = xdr_one; /* attributes follow */
+ return encode_fattr3(rqstp, p, fhp, &stat);
+ }
}
*p++ = xdr_zero;
return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
struct nfsd3_attrstat *resp)
{
if (resp->status == 0)
- p = encode_fattr3(rqstp, p, &resp->fh);
+ p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999ff33e..aa7bb41b293 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap)
}
static inline u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+ struct kstat *stat)
{
- struct vfsmount *mnt = fhp->fh_export->ex_mnt;
struct dentry *dentry = fhp->fh_dentry;
- struct kstat stat;
int type;
struct timespec time;
- vfs_getattr(mnt, dentry, &stat);
- type = (stat.mode & S_IFMT);
+ type = (stat->mode & S_IFMT);
*p++ = htonl(nfs_ftypes[type >> 12]);
- *p++ = htonl((u32) stat.mode);
- *p++ = htonl((u32) stat.nlink);
- *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
- *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
+ *p++ = htonl((u32) stat->mode);
+ *p++ = htonl((u32) stat->nlink);
+ *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+ *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
- if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) {
+ if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
*p++ = htonl(NFS_MAXPATHLEN);
} else {
- *p++ = htonl((u32) stat.size);
+ *p++ = htonl((u32) stat->size);
}
- *p++ = htonl((u32) stat.blksize);
+ *p++ = htonl((u32) stat->blksize);
if (S_ISCHR(type) || S_ISBLK(type))
- *p++ = htonl(new_encode_dev(stat.rdev));
+ *p++ = htonl(new_encode_dev(stat->rdev));
else
*p++ = htonl(0xffffffff);
- *p++ = htonl((u32) stat.blocks);
+ *p++ = htonl((u32) stat->blocks);
if (is_fsid(fhp, rqstp->rq_reffh))
*p++ = htonl((u32) fhp->fh_export->ex_fsid);
else
- *p++ = htonl(new_encode_dev(stat.dev));
- *p++ = htonl((u32) stat.ino);
- *p++ = htonl((u32) stat.atime.tv_sec);
- *p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0);
+ *p++ = htonl(new_encode_dev(stat->dev));
+ *p++ = htonl((u32) stat->ino);
+ *p++ = htonl((u32) stat->atime.tv_sec);
+ *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
lease_get_mtime(dentry->d_inode, &time);
*p++ = htonl((u32) time.tv_sec);
*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
- *p++ = htonl((u32) stat.ctime.tv_sec);
- *p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0);
+ *p++ = htonl((u32) stat->ctime.tv_sec);
+ *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
return p;
}
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
/* Helper function for NFSv2 ACL code */
u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
{
- return encode_fattr(rqstp, p, fhp);
+ struct kstat stat;
+ vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
+ return encode_fattr(rqstp, p, fhp, &stat);
}
/*
@@ -394,7 +394,7 @@ int
nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
struct nfsd_attrstat *resp)
{
- p = encode_fattr(rqstp, p, &resp->fh);
+ p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
return xdr_ressize_check(rqstp, p);
}
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
struct nfsd_diropres *resp)
{
p = encode_fh(p, &resp->fh);
- p = encode_fattr(rqstp, p, &resp->fh);
+ p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
return xdr_ressize_check(rqstp, p);
}
@@ -428,7 +428,7 @@ int
nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
struct nfsd_readres *resp)
{
- p = encode_fattr(rqstp, p, &resp->fh);
+ p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->count);
xdr_ressize_check(rqstp, p);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3074b..df4019f0456 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -717,27 +717,33 @@ nfsd_close(struct file *filp)
* As this calls fsync (not fdatasync) there is no need for a write_inode
* after it.
*/
-static inline void nfsd_dosync(struct file *filp, struct dentry *dp,
- struct file_operations *fop)
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
+ struct file_operations *fop)
{
struct inode *inode = dp->d_inode;
int (*fsync) (struct file *, struct dentry *, int);
+ int err = nfs_ok;
filemap_fdatawrite(inode->i_mapping);
if (fop && (fsync = fop->fsync))
- fsync(filp, dp, 0);
+ err=fsync(filp, dp, 0);
filemap_fdatawait(inode->i_mapping);
+
+ return nfserrno(err);
}
-static void
+static int
nfsd_sync(struct file *filp)
{
+ int err;
struct inode *inode = filp->f_dentry->d_inode;
dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
down(&inode->i_sem);
- nfsd_dosync(filp, filp->f_dentry, filp->f_op);
+ err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
up(&inode->i_sem);
+
+ return err;
}
void
@@ -874,6 +880,16 @@ out:
return err;
}
+static void kill_suid(struct dentry *dentry)
+{
+ struct iattr ia;
+ ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
+
+ down(&dentry->d_inode->i_sem);
+ notify_change(dentry, &ia);
+ up(&dentry->d_inode->i_sem);
+}
+
static inline int
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
@@ -927,14 +943,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
}
/* clear setuid/setgid flag after write */
- if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) {
- struct iattr ia;
- ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
-
- down(&inode->i_sem);
- notify_change(dentry, &ia);
- up(&inode->i_sem);
- }
+ if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+ kill_suid(dentry);
if (err >= 0 && stable) {
static ino_t last_ino;
@@ -962,7 +972,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (inode->i_state & I_DIRTY) {
dprintk("nfsd: write sync %d\n", current->pid);
- nfsd_sync(file);
+ err=nfsd_sync(file);
}
#if 0
wake_up(&inode->i_wait);
@@ -1066,7 +1076,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
if (EX_ISSYNC(fhp->fh_export)) {
if (file->f_op && file->f_op->fsync) {
- nfsd_sync(file);
+ err = nfsd_sync(file);
} else {
err = nfserr_notsupp;
}
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 656bc43431b..e227a04261a 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -85,7 +85,7 @@ config ATARI_PARTITION
config IBM_PARTITION
bool "IBM disk label and partition support"
- depends on PARTITION_ADVANCED && ARCH_S390
+ depends on PARTITION_ADVANCED && S390
help
Say Y here if you would like to be able to read the hard disk
partition table format used by IBM DASD disks operating under CMS.
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 6327bcb2d73..78010ad60e4 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
struct hd_geometry *geo;
char type[5] = {0,};
char name[7] = {0,};
- struct vtoc_volume_label *vlabel;
+ union label_t {
+ struct vtoc_volume_label vol;
+ struct vtoc_cms_label cms;
+ } *label;
unsigned char *data;
Sector sect;
@@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
goto out_noinfo;
if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
goto out_nogeo;
- if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label),
- GFP_KERNEL)) == NULL)
- goto out_novlab;
+ if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
+ goto out_nolab;
if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
@@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
strncpy(name, data + 8, 6);
else
strncpy(name, data + 4, 6);
- memcpy (vlabel, data, sizeof(struct vtoc_volume_label));
+ memcpy(label, data, sizeof(union label_t));
put_dev_sector(sect);
EBCASC(type, 4);
@@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
/*
* VM style CMS1 labeled disk
*/
- int *label = (int *) vlabel;
-
- if (label[13] != 0) {
+ if (label->cms.disk_offset != 0) {
printk("CMS1/%8s(MDSK):", name);
/* disk is reserved minidisk */
- blocksize = label[3];
- offset = label[13];
- size = (label[7] - 1)*(blocksize >> 9);
+ blocksize = label->cms.block_size;
+ offset = label->cms.disk_offset;
+ size = (label->cms.block_count - 1) * (blocksize >> 9);
} else {
printk("CMS1/%8s:", name);
offset = (info->label_block + 1);
@@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
printk("VOL1/%8s:", name);
/* get block number and read then go through format1 labels */
- blk = cchhb2blk(&vlabel->vtoc, geo) + 1;
+ blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
counter = 0;
while ((data = read_dev_sector(bdev, blk*(blocksize/512),
&sect)) != NULL) {
@@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
}
printk("\n");
- kfree(vlabel);
+ kfree(label);
kfree(geo);
kfree(info);
return 1;
@@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
out_readerr:
out_badsect:
out_noioctl:
- kfree(vlabel);
-out_novlab:
+ kfree(label);
+out_nolab:
kfree(geo);
out_nogeo:
kfree(info);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e4b30..5e9251f6531 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
buffer = task_sig(task, buffer);
buffer = task_cap(task, buffer);
buffer = cpuset_task_status_allowed(task, buffer);
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
buffer = task_show_regs(task, buffer);
#endif
return buffer - orig;
diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f300709..5a0236e02ee 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
obj-$(CONFIG_RAMFS) += ramfs.o
-ramfs-objs := inode.o
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 00000000000..2115383dcc8
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ * 2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct address_space_operations ramfs_aops = {
+ .readpage = simple_readpage,
+ .prepare_write = simple_prepare_write,
+ .commit_write = simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+ .read = generic_file_read,
+ .write = generic_file_write,
+ .mmap = generic_file_mmap,
+ .fsync = simple_sync_file,
+ .sendfile = generic_file_sendfile,
+ .llseek = generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+ .getattr = simple_getattr,
+};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 00000000000..3f810acd0bf
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+
+struct address_space_operations ramfs_aops = {
+ .readpage = simple_readpage,
+ .prepare_write = simple_prepare_write,
+ .commit_write = simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+ .mmap = ramfs_nommu_mmap,
+ .get_unmapped_area = ramfs_nommu_get_unmapped_area,
+ .read = generic_file_read,
+ .write = generic_file_write,
+ .fsync = simple_sync_file,
+ .sendfile = generic_file_sendfile,
+ .llseek = generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+ .setattr = ramfs_nommu_setattr,
+ .getattr = simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+ struct pagevec lru_pvec;
+ unsigned long npages, xpages, loop, limit;
+ struct page *pages;
+ unsigned order;
+ void *data;
+ int ret;
+
+ /* make various checks */
+ order = get_order(newsize);
+ if (unlikely(order >= MAX_ORDER))
+ goto too_big;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && newsize > limit)
+ goto fsize_exceeded;
+
+ if (newsize > inode->i_sb->s_maxbytes)
+ goto too_big;
+
+ i_size_write(inode, newsize);
+
+ /* allocate enough contiguous pages to be able to satisfy the
+ * request */
+ pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+ if (!pages)
+ return -ENOMEM;
+
+ /* split the high-order page into an array of single pages */
+ xpages = 1UL << order;
+ npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ for (loop = 0; loop < npages; loop++)
+ set_page_count(pages + loop, 1);
+
+ /* trim off any pages we don't actually require */
+ for (loop = npages; loop < xpages; loop++)
+ __free_page(pages + loop);
+
+ /* clear the memory we allocated */
+ newsize = PAGE_SIZE * npages;
+ data = page_address(pages);
+ memset(data, 0, newsize);
+
+ /* attach all the pages to the inode's address space */
+ pagevec_init(&lru_pvec, 0);
+ for (loop = 0; loop < npages; loop++) {
+ struct page *page = pages + loop;
+
+ ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+ if (ret < 0)
+ goto add_error;
+
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+
+ unlock_page(page);
+ }
+
+ pagevec_lru_add(&lru_pvec);
+ return 0;
+
+ fsize_exceeded:
+ send_sig(SIGXFSZ, current, 0);
+ too_big:
+ return -EFBIG;
+
+ add_error:
+ page_cache_release(pages + loop);
+ for (loop++; loop < npages; loop++)
+ __free_page(pages + loop);
+ return ret;
+}
+
+/*****************************************************************************/
+/*
+ * check that file shrinkage doesn't leave any VMAs dangling in midair
+ */
+static int ramfs_nommu_check_mappings(struct inode *inode,
+ size_t newsize, size_t size)
+{
+ struct vm_area_struct *vma;
+ struct prio_tree_iter iter;
+
+ /* search for VMAs that fall within the dead zone */
+ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+ newsize >> PAGE_SHIFT,
+ (size + PAGE_SIZE - 1) >> PAGE_SHIFT
+ ) {
+ /* found one - only interested if it's shared out of the page
+ * cache */
+ if (vma->vm_flags & VM_SHARED)
+ return -ETXTBSY; /* not quite true, but near enough */
+ }
+
+ return 0;
+}
+
+/*****************************************************************************/
+/*
+ *
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+ int ret;
+
+ /* assume a truncate from zero size is going to be for the purposes of
+ * shared mmap */
+ if (size == 0) {
+ if (unlikely(newsize >> 32))
+ return -EFBIG;
+
+ return ramfs_nommu_expand_for_mapping(inode, newsize);
+ }
+
+ /* check that a decrease in size doesn't cut off any shared mappings */
+ if (newsize < size) {
+ ret = ramfs_nommu_check_mappings(inode, newsize, size);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = vmtruncate(inode, size);
+
+ return ret;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ struct inode *inode = dentry->d_inode;
+ unsigned int old_ia_valid = ia->ia_valid;
+ int ret = 0;
+
+ /* by providing our own setattr() method, we skip this quotaism */
+ if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
+ (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
+ ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
+
+ /* pick out size-changing events */
+ if (ia->ia_valid & ATTR_SIZE) {
+ loff_t size = i_size_read(inode);
+ if (ia->ia_size != size) {
+ ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+ if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+ goto out;
+ } else {
+ /* we skipped the truncate but must still update
+ * timestamps
+ */
+ ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+ }
+ }
+
+ ret = inode_setattr(inode, ia);
+ out:
+ ia->ia_valid = old_ia_valid;
+ return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ * - the pages to be mapped must exist
+ * - the pages be physically contiguous in sequence
+ */
+unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ unsigned long maxpages, lpages, nr, loop, ret;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct page **pages = NULL, **ptr, *page;
+ loff_t isize;
+
+ if (!(flags & MAP_SHARED))
+ return addr;
+
+ /* the mapping mustn't extend beyond the EOF */
+ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ isize = i_size_read(inode);
+
+ ret = -EINVAL;
+ maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= maxpages)
+ goto out;
+
+ if (maxpages - pgoff < lpages)
+ goto out;
+
+ /* gang-find the pages */
+ ret = -ENOMEM;
+ pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto out;
+
+ nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+ if (nr != lpages)
+ goto out; /* leave if some pages were missing */
+
+ /* check the pages for physical adjacency */
+ ptr = pages;
+ page = *ptr++;
+ page++;
+ for (loop = lpages; loop > 1; loop--)
+ if (*ptr++ != page++)
+ goto out;
+
+ /* okay - all conditions fulfilled */
+ ret = (unsigned long) page_address(pages[0]);
+
+ out:
+ if (pages) {
+ ptr = pages;
+ for (loop = lpages; loop > 0; loop--)
+ put_page(*ptr++);
+ kfree(pages);
+ }
+
+ return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping
+ */
+int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return 0;
+}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917605a..c66bd5e4c05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
#include <linux/ramfs.h>
#include <asm/uaccess.h>
+#include "internal.h"
/* some random number */
#define RAMFS_MAGIC 0x858458f6
static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
static struct inode_operations ramfs_dir_inode_operations;
static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
-static struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .prepare_write = simple_prepare_write,
- .commit_write = simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
- .read = generic_file_read,
- .write = generic_file_write,
- .mmap = generic_file_mmap,
- .fsync = simple_sync_file,
- .sendfile = generic_file_sendfile,
- .llseek = generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
- .getattr = simple_getattr,
-};
-
static struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 00000000000..272c8a7120b
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern struct address_space_operations ramfs_aops;
+extern struct file_operations ramfs_file_operations;
+extern struct inode_operations ramfs_file_inode_operations;