21 files changed, 987 insertions, 433 deletions
diff --git a/fs/exec.c b/fs/exec.c
index 22533cce061..e75a9548da8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
-	page_add_anon_rmap(page, vma, address);
+	page_add_new_anon_rmap(page, vma, address);
 	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e621f4..e08ab4702d9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req)
 	spin_unlock(&fuse_lock);
 }
 
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+	struct fuse_init_out *arg = &req->misc.init_out;
+
+	if (arg->major != FUSE_KERNEL_VERSION)
+		fc->conn_error = 1;
+	else {
+		fc->minor = arg->minor;
+		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+	}
+
+	/* After INIT reply is received other requests can go
+	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+	   up()s on outstanding_sem.  The last up() is done in
+	   fuse_putback_request() */
+	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+		up(&fc->outstanding_sem);
+}
+
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
@@ -172,19 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		up_read(&fc->sbput_sem);
 	}
 	wake_up(&req->waitq);
-	if (req->in.h.opcode == FUSE_INIT) {
-		int i;
-
-		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
-			fc->conn_error = 1;
-
-		/* After INIT reply is received other requests can go
-		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
-		   up()s on outstanding_sem.  The last up() is done in
-		   fuse_putback_request() */
-		for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
-			up(&fc->outstanding_sem);
-	} else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
+	if (req->in.h.opcode == FUSE_INIT)
+		process_init_reply(fc, req);
+	else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
 		/* Special case for failed iget in CREATE */
 		u64 nodeid = req->in.h.nodeid;
 		__fuse_get_request(req);
@@ -357,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc)
 	/* This is called from fuse_read_super() so there's guaranteed
 	   to be a request available */
 	struct fuse_req *req = do_get_request(fc);
-	struct fuse_init_in_out *arg = &req->misc.init_in_out;
+	struct fuse_init_in *arg = &req->misc.init_in;
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	req->in.h.opcode = FUSE_INIT;
@@ -365,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc)
 	req->in.args[0].size = sizeof(*arg);
 	req->in.args[0].value = arg;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(*arg);
-	req->out.args[0].value = arg;
+	/* Variable length arguement used for backward compatibility
+	   with interface version < 7.5.  Rest of init_out is zeroed
+	   by do_get_request(), so a short reply is not a problem */
+	req->out.argvar = 1;
+	req->out.args[0].size = sizeof(struct fuse_init_out);
+	req->out.args[0].value = &req->misc.init_out;
 	request_send_background(fc, req);
 }
 
@@ -615,6 +629,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	struct fuse_copy_state cs;
 	unsigned reqsize;
 
+ restart:
 	spin_lock(&fuse_lock);
 	fc = file->private_data;
 	err = -EPERM;
@@ -630,20 +645,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	list_del_init(&req->list);
-	spin_unlock(&fuse_lock);
 
 	in = &req->in;
-	reqsize = req->in.h.len;
-	fuse_copy_init(&cs, 1, req, iov, nr_segs);
-	err = -EINVAL;
-	if (iov_length(iov, nr_segs) >= reqsize) {
-		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
-		if (!err)
-			err = fuse_copy_args(&cs, in->numargs, in->argpages,
-					     (struct fuse_arg *) in->args, 0);
+	reqsize = in->h.len;
+	/* If request is too large, reply with an error and restart the read */
+	if (iov_length(iov, nr_segs) < reqsize) {
+		req->out.h.error = -EIO;
+		/* SETXATTR is special, since it may contain too large data */
+		if (in->h.opcode == FUSE_SETXATTR)
+			req->out.h.error = -E2BIG;
+		request_end(fc, req);
+		goto restart;
 	}
+	spin_unlock(&fuse_lock);
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+	if (!err)
+		err = fuse_copy_args(&cs, in->numargs, in->argpages,
+				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(&cs);
-
 	spin_lock(&fuse_lock);
 	req->locked = 0;
 	if (!err && req->interrupted)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 51f5da65277..417bcee466f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,8 +13,16 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
-#include <linux/mount.h>
 
+/*
+ * FUSE caches dentries and attributes with separate timeout.  The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_time and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
 static inline unsigned long time_to_jiffies(unsigned long sec,
 					    unsigned long nsec)
 {
@@ -22,6 +30,50 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
 	return jiffies + timespec_to_jiffies(&ts);
 }
 
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
+static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
+{
+	entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
+	if (entry->d_inode)
+		get_fuse_inode(entry->d_inode)->i_time =
+			time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
+void fuse_invalidate_attr(struct inode *inode)
+{
+	get_fuse_inode(inode)->i_time = jiffies - 1;
+}
+
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
+static void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+	entry->d_time = jiffies - 1;
+}
+
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+	d_invalidate(entry);
+	fuse_invalidate_entry_cache(entry);
+}
+
 static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 			     struct dentry *entry,
 			     struct fuse_entry_out *outarg)
@@ -37,17 +89,34 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 	req->out.args[0].value = outarg;
 }
 
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup.  If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more.  If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
-	if (!entry->d_inode || is_bad_inode(entry->d_inode))
+	struct inode *inode = entry->d_inode;
+
+	if (inode && is_bad_inode(inode))
 		return 0;
 	else if (time_after(jiffies, entry->d_time)) {
 		int err;
 		struct fuse_entry_out outarg;
-		struct inode *inode = entry->d_inode;
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_req *req = fuse_get_request(fc);
+		struct fuse_conn *fc;
+		struct fuse_req *req;
+
+		/* Doesn't hurt to "reset" the validity timeout */
+		fuse_invalidate_entry_cache(entry);
+		if (!inode)
+			return 0;
+
+		fc = get_fuse_conn(inode);
+		req = fuse_get_request(fc);
 		if (!req)
 			return 0;
 
@@ -55,6 +124,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		request_send(fc, req);
 		err = req->out.h.error;
 		if (!err) {
+			struct fuse_inode *fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
 				fuse_send_forget(fc, req, outarg.nodeid, 1);
 				return 0;
@@ -66,18 +136,18 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 			return 0;
 
 		fuse_change_attributes(inode, &outarg.attr);
-		entry->d_time = time_to_jiffies(outarg.entry_valid,
-						outarg.entry_valid_nsec);
-		fi->i_time = time_to_jiffies(outarg.attr_valid,
-					     outarg.attr_valid_nsec);
+		fuse_change_timeout(entry, &outarg);
 	}
 	return 1;
 }
 
+/*
+ * Check if there's already a hashed alias of this directory inode.
+ * If yes, then lookup and mkdir must not create a new alias.
+ */
 static int dir_alias(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode)) {
-		/* Don't allow creating an alias to a directory  */
 		struct dentry *alias = d_find_alias(inode);
 		if (alias) {
 			dput(alias);
@@ -96,8 +166,14 @@ static struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
-static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
-			    struct inode **inodep)
+static inline int valid_mode(int m)
+{
+	return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  struct nameidata *nd)
 {
 	int err;
 	struct fuse_entry_out outarg;
@@ -106,53 +182,49 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 	struct fuse_req *req;
 
 	if (entry->d_name.len > FUSE_NAME_MAX)
-		return -ENAMETOOLONG;
+		return ERR_PTR(-ENAMETOOLONG);
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -EINTR;
+		return ERR_PTR(-EINTR);
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && invalid_nodeid(outarg.nodeid))
+	if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
+		     !valid_mode(outarg.attr.mode)))
 		err = -EIO;
-	if (!err) {
+	if (!err && outarg.nodeid) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 				  &outarg.attr);
 		if (!inode) {
 			fuse_send_forget(fc, req, outarg.nodeid, 1);
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 		}
 	}
 	fuse_put_request(fc, req);
 	if (err && err != -ENOENT)
-		return err;
+		return ERR_PTR(err);
 
-	if (inode) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		entry->d_time =	time_to_jiffies(outarg.entry_valid,
-						outarg.entry_valid_nsec);
-		fi->i_time = time_to_jiffies(outarg.attr_valid,
-					     outarg.attr_valid_nsec);
+	if (inode && dir_alias(inode)) {
+		iput(inode);
+		return ERR_PTR(-EIO);
 	}
-
+	d_add(entry, inode);
 	entry->d_op = &fuse_dentry_operations;
-	*inodep = inode;
-	return 0;
-}
-
-void fuse_invalidate_attr(struct inode *inode)
-{
-	get_fuse_inode(inode)->i_time = jiffies - 1;
-}
-
-static void fuse_invalidate_entry(struct dentry *entry)
-{
-	d_invalidate(entry);
-	entry->d_time = jiffies - 1;
+	if (!err)
+		fuse_change_timeout(entry, &outarg);
+	else
+		fuse_invalidate_entry_cache(entry);
+	return NULL;
 }
 
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
 static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 			    struct nameidata *nd)
 {
@@ -163,7 +235,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_open_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
-	struct fuse_inode *fi;
 	struct fuse_file *ff;
 	struct file *file;
 	int flags = nd->intent.open.flags - 1;
@@ -172,10 +243,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		goto out;
 
-	err = -ENAMETOOLONG;
-	if (entry->d_name.len > FUSE_NAME_MAX)
-		goto out;
-
 	err = -EINTR;
 	req = fuse_get_request(fc);
 	if (!req)
@@ -220,17 +287,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		ff->fh = outopen.fh;
+		/* Special release, with inode = NULL, this will
+		   trigger a 'forget' request when the release is
+		   complete */
 		fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
 		goto out_put_request;
 	}
 	fuse_put_request(fc, req);
-	entry->d_time =	time_to_jiffies(outentry.entry_valid,
-					outentry.entry_valid_nsec);
-	fi = get_fuse_inode(inode);
-	fi->i_time = time_to_jiffies(outentry.attr_valid,
-				     outentry.attr_valid_nsec);
-
 	d_instantiate(entry, inode);
+	fuse_change_timeout(entry, &outentry);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
 		ff->fh = outopen.fh;
@@ -248,13 +313,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	return err;
 }
 
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
 static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 			    struct inode *dir, struct dentry *entry,
 			    int mode)
 {
 	struct fuse_entry_out outarg;
 	struct inode *inode;
-	struct fuse_inode *fi;
 	int err;
 
 	req->in.h.nodeid = get_node_id(dir);
@@ -268,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		fuse_put_request(fc, req);
 		return err;
 	}
-	if (invalid_nodeid(outarg.nodeid)) {
-		fuse_put_request(fc, req);
-		return -EIO;
-	}
+	err = -EIO;
+	if (invalid_nodeid(outarg.nodeid))
+		goto out_put_request;
+
+	if ((outarg.attr.mode ^ mode) & S_IFMT)
+		goto out_put_request;
+
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 			  &outarg.attr);
 	if (!inode) {
@@ -280,22 +350,19 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	}
 	fuse_put_request(fc, req);
 
-	/* Don't allow userspace to do really stupid things... */
-	if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) {
+	if (dir_alias(inode)) {
 		iput(inode);
 		return -EIO;
 	}
 
-	entry->d_time = time_to_jiffies(outarg.entry_valid,
-					outarg.entry_valid_nsec);
-
-	fi = get_fuse_inode(inode);
-	fi->i_time = time_to_jiffies(outarg.attr_valid,
-				     outarg.attr_valid_nsec);
-
 	d_instantiate(entry, inode);
+	fuse_change_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
 	return 0;
+
+ out_put_request:
+	fuse_put_request(fc, req);
+	return err;
 }
 
 static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
@@ -355,12 +422,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req;
-
-	if (len > FUSE_SYMLINK_MAX)
-		return -ENAMETOOLONG;
-
-	req = fuse_get_request(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -399,6 +461,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 		inode->i_nlink = 0;
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
 	return err;
@@ -424,6 +487,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	if (!err) {
 		entry->d_inode->i_nlink = 0;
 		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
 	return err;
@@ -459,6 +523,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 		fuse_invalidate_attr(olddir);
 		if (olddir != newdir)
 			fuse_invalidate_attr(newdir);
+
+		/* newent will end up negative */
+		if (newent->d_inode)
+			fuse_invalidate_entry_cache(newent);
 	} else if (err == -EINTR) {
 		/* If request was interrupted, DEITY only knows if the
 		   rename actually took place.  If the invalidation
@@ -566,6 +634,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 	return 0;
 }
 
+/*
+ * Check whether the inode attributes are still valid
+ *
+ * If the attribute validity timeout has expired, then fetch the fresh
+ * attributes with a 'getattr' request
+ *
+ * I'm not sure why cached attributes are never returned for the root
+ * inode, this is probably being too cautious.
+ */
 static int fuse_revalidate(struct dentry *entry)
 {
 	struct inode *inode = entry->d_inode;
@@ -613,6 +690,19 @@ static int fuse_access(struct inode *inode, int mask)
 	return err;
 }
 
+/*
+ * Check permission.  The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode.  This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation.  An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent.  Execute permission is still checked
+ * locally based on file mode.
+ */
 static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -631,14 +721,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 				err = generic_permission(inode, mask, NULL);
 		}
 
-		/* FIXME: Need some mechanism to revoke permissions:
-		   currently if the filesystem suddenly changes the
-		   file mode, we will not be informed about it, and
-		   continue to allow access to the file/directory.
-
-		   This is actually not so grave, since the user can
-		   simply keep access to the file/directory anyway by
-		   keeping it open... */
+		/* Note: the opposite of the above test does not
+		   exist.  So if permissions are revoked this won't be
+		   noticed immediately, only after the attribute
+		   timeout has expired */
 
 		return err;
 	} else {
@@ -691,7 +777,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	struct page *page;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -806,6 +897,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 	}
 }
 
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case.  So do the rlimit
+ * checking by hand and call vmtruncate() only after the file has
+ * actually been truncated.
+ */
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
 	struct inode *inode = entry->d_inode;
@@ -883,23 +983,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 	return err;
 }
 
-static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
-				  struct nameidata *nd)
-{
-	struct inode *inode;
-	int err;
-
-	err = fuse_lookup_iget(dir, entry, &inode);
-	if (err)
-		return ERR_PTR(err);
-	if (inode && dir_alias(inode)) {
-		iput(inode);
-		return ERR_PTR(-EIO);
-	}
-	d_add(entry, inode);
-	return NULL;
-}
-
 static int fuse_setxattr(struct dentry *entry, const char *name,
 			 const void *value, size_t size, int flags)
 {
@@ -909,9 +992,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	struct fuse_setxattr_in inarg;
 	int err;
 
-	if (size > FUSE_XATTR_SIZE_MAX)
-		return -E2BIG;
-
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2ca86141d13..05dedddf428 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -163,6 +163,9 @@ static int fuse_flush(struct file *file)
 	struct fuse_flush_in inarg;
 	int err;
 
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	if (fc->no_flush)
 		return 0;
 
@@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	struct fuse_fsync_in inarg;
 	int err;
 
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
@@ -272,16 +278,22 @@ static int fuse_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
-	struct fuse_req *req = fuse_get_request(fc);
-	int err = -EINTR;
+	struct fuse_req *req;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	err = -EINTR;
+	req = fuse_get_request(fc);
 	if (!req)
 		goto out;
 
 	req->out.page_zeroing = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE);
+	fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err)
@@ -295,7 +307,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 static int fuse_send_readpages(struct fuse_req *req, struct file *file,
 			       struct inode *inode)
 {
-	loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
 	unsigned i;
 	req->out.page_zeroing = 1;
@@ -345,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_readpages_data data;
 	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	data.file = file;
 	data.inode = inode;
 	data.req = fuse_get_request(fc);
@@ -402,8 +418,13 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	unsigned count = to - offset;
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
-	struct fuse_req *req = fuse_get_request(fc);
+	loff_t pos = page_offset(page) + offset;
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -454,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	npages = min(npages, FUSE_MAX_PAGES_PER_REQ);
+	npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
 	npages = get_user_pages(current, current->mm, user_addr, npages, write,
 				0, req->pages, NULL);
@@ -475,12 +496,16 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	size_t nmax = write ? fc->max_write : fc->max_read;
 	loff_t pos = *ppos;
 	ssize_t res = 0;
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
 	while (count) {
-		size_t tmp;
 		size_t nres;
 		size_t nbytes = min(count, nmax);
 		int err = fuse_get_user_pages(req, buf, nbytes, !write);
@@ -488,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 			res = err;
 			break;
 		}
-		tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes, tmp);
+		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+		nbytes = min(count, nbytes);
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes);
 		else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301f86b..74c8d098a14 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,9 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
     permission checking is done in the kernel */
@@ -108,9 +111,6 @@ struct fuse_out {
 	struct fuse_arg args[3];
 };
 
-struct fuse_req;
-struct fuse_conn;
-
 /**
  * A request to the client
  */
@@ -159,7 +159,8 @@ struct fuse_req {
 	union {
 		struct fuse_forget_in forget_in;
 		struct fuse_release_in release_in;
-		struct fuse_init_in_out init_in_out;
+		struct fuse_init_in init_in;
+		struct fuse_init_out init_out;
 	} misc;
 
 	/** page vector */
@@ -272,6 +273,9 @@ struct fuse_conn {
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Negotiated minor version */
+	unsigned minor;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546844d..04c80cc957a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 		fuse_init_common(inode);
 		init_special_inode(inode, inode->i_mode,
 				   new_decode_dev(attr->rdev));
-	} else {
-		/* Don't let user create weird files */
-		inode->i_mode = S_IFREG;
-		fuse_init_common(inode);
-		fuse_init_file_inode(inode);
-	}
+	} else
+		BUG();
 }
 
 static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -218,6 +214,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 {
 	stbuf->f_type    = FUSE_SUPER_MAGIC;
 	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_frsize  = attr->frsize;
 	stbuf->f_blocks  = attr->blocks;
 	stbuf->f_bfree   = attr->bfree;
 	stbuf->f_bavail  = attr->bavail;
@@ -238,10 +235,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 	if (!req)
 		return -EINTR;
 
+	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
 	req->in.h.opcode = FUSE_STATFS;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].size =
+		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
 	req->out.args[0].value = &outarg;
 	request_send(fc, req);
 	err = req->out.h.error;
@@ -482,7 +481,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-	fc->max_write = FUSE_MAX_IN / 2;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 8093351bd7c..6daaf7c755a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -320,7 +320,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	/* temporarily use utf8 to correctly find the hidden dir below */
 	nls = sbi->nls;
 	sbi->nls = load_nls("utf8");
-	if (!nls) {
+	if (!sbi->nls) {
 		printk("HFS+: unable to load nls for utf8\n");
 		err = -EINVAL;
 		goto cleanup;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3bb67..8c41315a6e4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 
-	if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
-		return -EINVAL;
-
 	if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
 		return -EINVAL;
 
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51fd00d..cb3cef525c3 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction. 
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
 
-static inline void __buffer_unlink(struct journal_head *jh)
+static void __buffer_unlink_first(struct journal_head *jh)
 {
 	transaction_t *transaction;
 
 	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.  
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction. 
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Didn't somebody clean up the transaction in the meanwhile */
+	if (journal->j_checkpoint_transactions != transaction ||
+		transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
-		}
-
-		/*
-		 * This is foul
-		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
 		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		get_bh(bh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		put_bh(bh);
+		ret = 1;
+	}
+	else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	}
+	else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		put_bh(bh);
+		ret = 1;
+	}
+	else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.  
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+ 	if (journal->j_checkpoint_transactions == transaction ||
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,
+						&batch_count);
+			if (!retry &&
+			    lock_need_resched(&journal->j_list_lock)) {
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble. 
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list.  Let's clean up the second one.
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal)
 /* Checkpoint list management */
 
 /*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+ 	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
+	int ret = 0, released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +571,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.  
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380fe667..6d2dfed1de0 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int	nfserr;
+	int	err, nfserr;
 
 	dprintk("nfsd: GETATTR(3)  %s\n",
-				SVCFH_fmt(&argp->fh));
+		SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	err = vfs_getattr(resp->fh.fh_export->ex_mnt,
+			  resp->fh.fh_dentry, &resp->stat);
+	nfserr = nfserrno(err);
+
 	RETURN_STATUS(nfserr);
 }
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b8524d0..243d94b9653 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	      struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-
-	*p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
-	if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
+	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
 		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
 	} else {
-		p = xdr_encode_hyper(p, (u64) stat.size);
+		p = xdr_encode_hyper(p, (u64) stat->size);
 	}
-	p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9);
-	*p++ = htonl((u32) MAJOR(stat.rdev));
-	*p++ = htonl((u32) MINOR(stat.rdev));
+	p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+	*p++ = htonl((u32) MAJOR(stat->rdev));
+	*p++ = htonl((u32) MINOR(stat->rdev));
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
 	else
-		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev));
-	p = xdr_encode_hyper(p, (u64) stat.ino);
-	p = encode_time3(p, &stat.atime);
+		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
+	p = xdr_encode_hyper(p, (u64) stat->ino);
+	p = encode_time3(p, &stat->atime);
 	lease_get_mtime(dentry->d_inode, &time); 
 	p = encode_time3(p, &time);
-	p = encode_time3(p, &stat.ctime);
+	p = encode_time3(p, &stat->ctime);
 
 	return p;
 }
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
-		*p++ = xdr_one;		/* attributes follow */
-		return encode_fattr3(rqstp, p, fhp);
+	        int err;
+		struct kstat stat;
+
+		err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
+		if (!err) {
+			*p++ = xdr_one;		/* attributes follow */
+			return encode_fattr3(rqstp, p, fhp, &stat);
+		}
 	}
 	*p++ = xdr_zero;
 	return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
-		p = encode_fattr3(rqstp, p, &resp->fh);
+		p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999ff33e..aa7bb41b293 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	     struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	int type;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-	type = (stat.mode & S_IFMT);
+	type = (stat->mode & S_IFMT);
 
 	*p++ = htonl(nfs_ftypes[type >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
 
-	if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) {
+	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
 	} else {
-		*p++ = htonl((u32) stat.size);
+		*p++ = htonl((u32) stat->size);
 	}
-	*p++ = htonl((u32) stat.blksize);
+	*p++ = htonl((u32) stat->blksize);
 	if (S_ISCHR(type) || S_ISBLK(type))
-		*p++ = htonl(new_encode_dev(stat.rdev));
+		*p++ = htonl(new_encode_dev(stat->rdev));
 	else
 		*p++ = htonl(0xffffffff);
-	*p++ = htonl((u32) stat.blocks);
+	*p++ = htonl((u32) stat->blocks);
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		*p++ = htonl((u32) fhp->fh_export->ex_fsid);
 	else
-		*p++ = htonl(new_encode_dev(stat.dev));
-	*p++ = htonl((u32) stat.ino);
-	*p++ = htonl((u32) stat.atime.tv_sec);
-	*p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0);
+		*p++ = htonl(new_encode_dev(stat->dev));
+	*p++ = htonl((u32) stat->ino);
+	*p++ = htonl((u32) stat->atime.tv_sec);
+	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
 	lease_get_mtime(dentry->d_inode, &time); 
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 
-	*p++ = htonl((u32) stat.ctime.tv_sec);
-	*p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0);
+	*p++ = htonl((u32) stat->ctime.tv_sec);
+	*p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
 
 	return p;
 }
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /* Helper function for NFSv2 ACL code */
 u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
-	return encode_fattr(rqstp, p, fhp);
+	struct kstat stat;
+	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
+	return encode_fattr(rqstp, p, fhp, &stat);
 }
 
 /*
@@ -394,7 +394,7 @@ int
 nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_attrstat *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -428,7 +428,7 @@ int
 nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_readres *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->count);
 	xdr_ressize_check(rqstp, p);
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3074b..df4019f0456 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -717,27 +717,33 @@ nfsd_close(struct file *filp)
  * As this calls fsync (not fdatasync) there is no need for a write_inode
  * after it.
  */
-static inline void nfsd_dosync(struct file *filp, struct dentry *dp,
-			       struct file_operations *fop)
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
+			      struct file_operations *fop)
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
+	int err = nfs_ok;
 
 	filemap_fdatawrite(inode->i_mapping);
 	if (fop && (fsync = fop->fsync))
-		fsync(filp, dp, 0);
+		err=fsync(filp, dp, 0);
 	filemap_fdatawait(inode->i_mapping);
+
+	return nfserrno(err);
 }
 	
 
-static void
+static int
 nfsd_sync(struct file *filp)
 {
+        int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
 	down(&inode->i_sem);
-	nfsd_dosync(filp, filp->f_dentry, filp->f_op);
+	err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
 	up(&inode->i_sem);
+
+	return err;
 }
 
 void
@@ -874,6 +880,16 @@ out:
 	return err;
 }
 
+static void kill_suid(struct dentry *dentry)
+{
+	struct iattr	ia;
+	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
+
+	down(&dentry->d_inode->i_sem);
+	notify_change(dentry, &ia);
+	up(&dentry->d_inode->i_sem);
+}
+
 static inline int
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
@@ -927,14 +943,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	/* clear setuid/setgid flag after write */
-	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) {
-		struct iattr	ia;
-		ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
-
-		down(&inode->i_sem);
-		notify_change(dentry, &ia);
-		up(&inode->i_sem);
-	}
+	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+		kill_suid(dentry);
 
 	if (err >= 0 && stable) {
 		static ino_t	last_ino;
@@ -962,7 +972,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 			if (inode->i_state & I_DIRTY) {
 				dprintk("nfsd: write sync %d\n", current->pid);
-				nfsd_sync(file);
+				err=nfsd_sync(file);
 			}
 #if 0
 			wake_up(&inode->i_wait);
@@ -1066,7 +1076,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		return err;
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
-			nfsd_sync(file);
+			err = nfsd_sync(file);
 		} else {
 			err = nfserr_notsupp;
 		}
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 656bc43431b..e227a04261a 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -85,7 +85,7 @@ config ATARI_PARTITION
 
 config IBM_PARTITION
 	bool "IBM disk label and partition support"
-	depends on PARTITION_ADVANCED && ARCH_S390
+	depends on PARTITION_ADVANCED && S390
 	help
 	  Say Y here if you would like to be able to read the hard disk
 	  partition table format used by IBM DASD disks operating under CMS.
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 6327bcb2d73..78010ad60e4 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct hd_geometry *geo;
 	char type[5] = {0,};
 	char name[7] = {0,};
-	struct vtoc_volume_label *vlabel;
+	union label_t {
+		struct vtoc_volume_label vol;
+		struct vtoc_cms_label cms;
+	} *label;
 	unsigned char *data;
 	Sector sect;
 
@@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_noinfo;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
 		goto out_nogeo;
-	if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label),
-			      GFP_KERNEL)) == NULL)
-		goto out_novlab;
+	if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
+		goto out_nolab;
 	
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
@@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		strncpy(name, data + 8, 6);
 	else
 		strncpy(name, data + 4, 6);
-	memcpy (vlabel, data, sizeof(struct vtoc_volume_label));
+	memcpy(label, data, sizeof(union label_t));
 	put_dev_sector(sect);
 
 	EBCASC(type, 4);
@@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/*
 		 * VM style CMS1 labeled disk
 		 */
-		int *label = (int *) vlabel;
-
-		if (label[13] != 0) {
+		if (label->cms.disk_offset != 0) {
 			printk("CMS1/%8s(MDSK):", name);
 			/* disk is reserved minidisk */
-			blocksize = label[3];
-			offset = label[13];
-			size = (label[7] - 1)*(blocksize >> 9);
+			blocksize = label->cms.block_size;
+			offset = label->cms.disk_offset;
+			size = (label->cms.block_count - 1) * (blocksize >> 9);
 		} else {
 			printk("CMS1/%8s:", name);
 			offset = (info->label_block + 1);
@@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk("VOL1/%8s:", name);
 
 		/* get block number and read then go through format1 labels */
-		blk = cchhb2blk(&vlabel->vtoc, geo) + 1;
+		blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 		counter = 0;
 		while ((data = read_dev_sector(bdev, blk*(blocksize/512),
 					       &sect)) != NULL) {
@@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	printk("\n");
-	kfree(vlabel);
+	kfree(label);
 	kfree(geo);
 	kfree(info);
 	return 1;
@@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 out_readerr:
 out_badsect:
 out_noioctl:
-	kfree(vlabel);
-out_novlab:
+	kfree(label);
+out_nolab:
 	kfree(geo);
 out_nogeo:
 	kfree(info);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e4b30..5e9251f6531 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
 	buffer = task_sig(task, buffer);
 	buffer = task_cap(task, buffer);
 	buffer = cpuset_task_status_allowed(task, buffer);
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	buffer = task_show_regs(task, buffer);
 #endif
 	return buffer - orig;
diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f300709..5a0236e02ee 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
 
 obj-$(CONFIG_RAMFS) += ramfs.o
 
-ramfs-objs := inode.o
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 00000000000..2115383dcc8
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= generic_file_mmap,
+	.fsync		= simple_sync_file,
+	.sendfile	= generic_file_sendfile,
+	.llseek		= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 00000000000..3f810acd0bf
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+
+struct address_space_operations ramfs_aops = {
+	.readpage		= simple_readpage,
+	.prepare_write		= simple_prepare_write,
+	.commit_write		= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.mmap			= ramfs_nommu_mmap,
+	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
+	.read			= generic_file_read,
+	.write			= generic_file_write,
+	.fsync			= simple_sync_file,
+	.sendfile		= generic_file_sendfile,
+	.llseek			= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.setattr		= ramfs_nommu_setattr,
+	.getattr		= simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+	struct pagevec lru_pvec;
+	unsigned long npages, xpages, loop, limit;
+	struct page *pages;
+	unsigned order;
+	void *data;
+	int ret;
+
+	/* make various checks */
+	order = get_order(newsize);
+	if (unlikely(order >= MAX_ORDER))
+		goto too_big;
+
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && newsize > limit)
+		goto fsize_exceeded;
+
+	if (newsize > inode->i_sb->s_maxbytes)
+		goto too_big;
+
+	i_size_write(inode, newsize);
+
+	/* allocate enough contiguous pages to be able to satisfy the
+	 * request */
+	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	if (!pages)
+		return -ENOMEM;
+
+	/* split the high-order page into an array of single pages */
+	xpages = 1UL << order;
+	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	for (loop = 0; loop < npages; loop++)
+		set_page_count(pages + loop, 1);
+
+	/* trim off any pages we don't actually require */
+	for (loop = npages; loop < xpages; loop++)
+		__free_page(pages + loop);
+
+	/* clear the memory we allocated */
+	newsize = PAGE_SIZE * npages;
+	data = page_address(pages);
+	memset(data, 0, newsize);
+
+	/* attach all the pages to the inode's address space */
+	pagevec_init(&lru_pvec, 0);
+	for (loop = 0; loop < npages; loop++) {
+		struct page *page = pages + loop;
+
+		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		if (ret < 0)
+			goto add_error;
+
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		unlock_page(page);
+	}
+
+	pagevec_lru_add(&lru_pvec);
+	return 0;
+
+ fsize_exceeded:
+	send_sig(SIGXFSZ, current, 0);
+ too_big:
+	return -EFBIG;
+
+ add_error:
+	page_cache_release(pages + loop);
+	for (loop++; loop < npages; loop++)
+		__free_page(pages + loop);
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * check that file shrinkage doesn't leave any VMAs dangling in midair
+ */
+static int ramfs_nommu_check_mappings(struct inode *inode,
+				      size_t newsize, size_t size)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	/* search for VMAs that fall within the dead zone */
+	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+			      newsize >> PAGE_SHIFT,
+			      (size + PAGE_SIZE - 1) >> PAGE_SHIFT
+			      ) {
+		/* found one - only interested if it's shared out of the page
+		 * cache */
+		if (vma->vm_flags & VM_SHARED)
+			return -ETXTBSY; /* not quite true, but near enough */
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ *
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+	int ret;
+
+	/* assume a truncate from zero size is going to be for the purposes of
+	 * shared mmap */
+	if (size == 0) {
+		if (unlikely(newsize >> 32))
+			return -EFBIG;
+
+		return ramfs_nommu_expand_for_mapping(inode, newsize);
+	}
+
+	/* check that a decrease in size doesn't cut off any shared mappings */
+	if (newsize < size) {
+		ret = ramfs_nommu_check_mappings(inode, newsize, size);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = vmtruncate(inode, size);
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *inode = dentry->d_inode;
+	unsigned int old_ia_valid = ia->ia_valid;
+	int ret = 0;
+
+	/* by providing our own setattr() method, we skip this quotaism */
+	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
+	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
+		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
+
+	/* pick out size-changing events */
+	if (ia->ia_valid & ATTR_SIZE) {
+		loff_t size = i_size_read(inode);
+		if (ia->ia_size != size) {
+			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/* we skipped the truncate but must still update
+			 * timestamps
+			 */
+			ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+		}
+	}
+
+	ret = inode_setattr(inode, ia);
+ out:
+	ia->ia_valid = old_ia_valid;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ *   - the pages to be mapped must exist
+ *   - the pages be physically contiguous in sequence
+ */
+unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+					    unsigned long addr, unsigned long len,
+					    unsigned long pgoff, unsigned long flags)
+{
+	unsigned long maxpages, lpages, nr, loop, ret;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages = NULL, **ptr, *page;
+	loff_t isize;
+
+	if (!(flags & MAP_SHARED))
+		return addr;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+
+	ret = -EINVAL;
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= maxpages)
+		goto out;
+
+	if (maxpages - pgoff < lpages)
+		goto out;
+
+	/* gang-find the pages */
+	ret = -ENOMEM;
+	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out;
+
+	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	if (nr != lpages)
+		goto out; /* leave if some pages were missing */
+
+	/* check the pages for physical adjacency */
+	ptr = pages;
+	page = *ptr++;
+	page++;
+	for (loop = lpages; loop > 1; loop--)
+		if (*ptr++ != page++)
+			goto out;
+
+	/* okay - all conditions fulfilled */
+	ret = (unsigned long) page_address(pages[0]);
+
+ out:
+	if (pages) {
+		ptr = pages;
+		for (loop = lpages; loop > 0; loop--)
+			put_page(*ptr++);
+		kfree(pages);
+	}
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping
+ */
+int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917605a..c66bd5e4c05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
 #include <linux/ramfs.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
 static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
 static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 	return error;
 }
 
-static struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
-	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
-	.llseek		= generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
 static struct inode_operations ramfs_dir_inode_operations = {
 	.create		= ramfs_create,
 	.lookup		= simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 00000000000..272c8a7120b
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern struct address_space_operations ramfs_aops;
+extern struct file_operations ramfs_file_operations;
+extern struct inode_operations ramfs_file_inode_operations;