aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c14
-rw-r--r--fs/9p/vfs_dir.c93
-rw-r--r--fs/9p/vfs_inode.c5
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/afs/file.c15
-rw-r--r--fs/bio.c28
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c81
-rw-r--r--fs/btrfs/async-thread.h10
-rw-r--r--fs/btrfs/btrfs_inode.h18
-rw-r--r--fs/btrfs/ctree.h19
-rw-r--r--fs/btrfs/disk-io.c50
-rw-r--r--fs/btrfs/extent-tree.c365
-rw-r--r--fs/btrfs/extent_io.c42
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c44
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c239
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/ordered-data.c6
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/super.c9
-rw-r--r--fs/btrfs/transaction.c64
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-log.c56
-rw-r--r--fs/btrfs/tree-log.h3
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/cachefiles/interface.c32
-rw-r--r--fs/cachefiles/namei.c187
-rw-r--r--fs/cachefiles/rdwr.c130
-rw-r--r--fs/cifs/CHANGES9
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c11
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/misc.c14
-rw-r--r--fs/cifs/readdir.c7
-rw-r--r--fs/compat.c2
-rw-r--r--fs/compat_ioctl.c4
-rw-r--r--fs/dlm/lowcomms.c36
-rw-r--r--fs/ecryptfs/Kconfig3
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/exec.c12
-rw-r--r--fs/ext3/fsync.c36
-rw-r--r--fs/ext3/inode.c36
-rw-r--r--fs/ext3/super.c15
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/extents.c36
-rw-r--r--fs/ext4/inode.c24
-rw-r--r--fs/ext4/namei.c16
-rw-r--r--fs/ext4/super.c20
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file.c1
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fscache/Kconfig7
-rw-r--r--fs/fscache/Makefile1
-rw-r--r--fs/fscache/cache.c5
-rw-r--r--fs/fscache/cookie.c26
-rw-r--r--fs/fscache/internal.h56
-rw-r--r--fs/fscache/main.c6
-rw-r--r--fs/fscache/object-list.c432
-rw-r--r--fs/fscache/object.c104
-rw-r--r--fs/fscache/operation.c120
-rw-r--r--fs/fscache/page.c273
-rw-r--r--fs/fscache/proc.c13
-rw-r--r--fs/fscache/stats.c94
-rw-r--r--fs/fuse/dir.c7
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/acl.c357
-rw-r--r--fs/gfs2/acl.h24
-rw-r--r--fs/gfs2/aops.c20
-rw-r--r--fs/gfs2/dir.c34
-rw-r--r--fs/gfs2/glock.c31
-rw-r--r--fs/gfs2/glock.h9
-rw-r--r--fs/gfs2/glops.c5
-rw-r--r--fs/gfs2/incore.h5
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/lops.c4
-rw-r--r--fs/gfs2/main.c4
-rw-r--r--fs/gfs2/ops_fstype.c154
-rw-r--r--fs/gfs2/quota.c393
-rw-r--r--fs/gfs2/quota.h5
-rw-r--r--fs/gfs2/recovery.c4
-rw-r--r--fs/gfs2/rgrp.c14
-rw-r--r--fs/gfs2/super.c110
-rw-r--r--fs/gfs2/super.h4
-rw-r--r--fs/gfs2/sys.c14
-rw-r--r--fs/gfs2/xattr.c74
-rw-r--r--fs/gfs2/xattr.h8
-rw-r--r--fs/hfs/btree.c5
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/inode.c10
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/jbd/journal.c3
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jffs2/read.c9
-rw-r--r--fs/namespace.c20
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/direct.c1
-rw-r--r--fs/nfs/fscache.c10
-rw-r--r--fs/nfs/nfs4namespace.c12
-rw-r--r--fs/nfs/nfs4proc.c17
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4xdr.c1
-rw-r--r--fs/nfs/super.c37
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nilfs2/btnode.c4
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/inode.c1
-rw-r--r--fs/nilfs2/ioctl.c39
-rw-r--r--fs/nilfs2/segment.c17
-rw-r--r--fs/notify/dnotify/dnotify.c3
-rw-r--r--fs/notify/inode_mark.c6
-rw-r--r--fs/notify/notification.c2
-rw-r--r--fs/ocfs2/file.c3
-rw-r--r--fs/ocfs2/ocfs2.h7
-rw-r--r--fs/ocfs2/refcounttree.c69
-rw-r--r--fs/ocfs2/super.c20
-rw-r--r--fs/ocfs2/uptodate.c5
-rw-r--r--fs/open.c27
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/pipe.c41
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/quota/Kconfig2
-rw-r--r--fs/quota/dquot.c93
-rw-r--r--fs/quota/quota.c93
-rw-r--r--fs/romfs/storage.c4
-rw-r--r--fs/sysfs/dir.c7
-rw-r--r--fs/sysfs/file.c14
-rw-r--r--fs/xattr_acl.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c38
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c41
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c59
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c36
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c1
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/xfs_ialloc.c1
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_itable.c21
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_trans_ail.c23
-rw-r--r--fs/xfs/xfs_vnodeops.c6
157 files changed, 3825 insertions, 1388 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 51c94e26a34..e777961939f 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
BUG_ON(!vcookie->fscache);
- if (PageFsCache(page)) {
- if (fscache_check_page_write(vcookie->fscache, page)) {
- if (!(gfp & __GFP_WAIT))
- return 0;
- fscache_wait_on_page_write(vcookie->fscache, page);
- }
-
- fscache_uncache_page(vcookie->fscache, page);
- ClearPageFsCache(page);
- }
-
- return 1;
+ return fscache_maybe_release_page(vcookie->fscache, page, gfp);
}
void __v9fs_fscache_invalidate_page(struct page *page)
@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
fscache_wait_on_page_write(vcookie->fscache, page);
BUG_ON(!PageLocked(page));
fscache_uncache_page(vcookie->fscache, page);
- ClearPageFsCache(page);
}
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 873cd31baa4..15cce53bf61 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -40,6 +40,24 @@
#include "fid.h"
/**
+ * struct p9_rdir - readdir accounting
+ * @mutex: mutex protecting readdir
+ * @head: start offset of current dirread buffer
+ * @tail: end offset of current dirread buffer
+ * @buf: dirread buffer
+ *
+ * private structure for keeping track of readdir
+ * allocated on demand
+ */
+
+struct p9_rdir {
+ struct mutex mutex;
+ int head;
+ int tail;
+ uint8_t *buf;
+};
+
+/**
* dt_type - return file type
* @mistat: mistat structure
*
@@ -70,56 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
int over;
struct p9_wstat st;
- int err;
+ int err = 0;
struct p9_fid *fid;
int buflen;
- char *statbuf;
- int n, i = 0;
+ int reclen = 0;
+ struct p9_rdir *rdir;
P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
fid = filp->private_data;
buflen = fid->clnt->msize - P9_IOHDRSZ;
- statbuf = kmalloc(buflen, GFP_KERNEL);
- if (!statbuf)
- return -ENOMEM;
-
- while (1) {
- err = v9fs_file_readn(filp, statbuf, NULL, buflen,
- fid->rdir_fpos);
- if (err <= 0)
- break;
-
- n = err;
- while (i < n) {
- err = p9stat_read(statbuf + i, buflen-i, &st,
- fid->clnt->dotu);
+
+ /* allocate rdir on demand */
+ if (!fid->rdir) {
+ rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+
+ if (rdir == NULL) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ spin_lock(&filp->f_dentry->d_lock);
+ if (!fid->rdir) {
+ rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
+ mutex_init(&rdir->mutex);
+ rdir->head = rdir->tail = 0;
+ fid->rdir = (void *) rdir;
+ rdir = NULL;
+ }
+ spin_unlock(&filp->f_dentry->d_lock);
+ kfree(rdir);
+ }
+ rdir = (struct p9_rdir *) fid->rdir;
+
+ err = mutex_lock_interruptible(&rdir->mutex);
+ while (err == 0) {
+ if (rdir->tail == rdir->head) {
+ err = v9fs_file_readn(filp, rdir->buf, NULL,
+ buflen, filp->f_pos);
+ if (err <= 0)
+ goto unlock_and_exit;
+
+ rdir->head = 0;
+ rdir->tail = err;
+ }
+
+ while (rdir->head < rdir->tail) {
+ err = p9stat_read(rdir->buf + rdir->head,
+ buflen - rdir->head, &st,
+ fid->clnt->dotu);
if (err) {
P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
err = -EIO;
p9stat_free(&st);
- goto free_and_exit;
+ goto unlock_and_exit;
}
-
- i += st.size+2;
- fid->rdir_fpos += st.size+2;
+ reclen = st.size+2;
over = filldir(dirent, st.name, strlen(st.name),
filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
- filp->f_pos += st.size+2;
-
p9stat_free(&st);
if (over) {
err = 0;
- goto free_and_exit;
+ goto unlock_and_exit;
}
+ rdir->head += reclen;
+ filp->f_pos += reclen;
}
}
-free_and_exit:
- kfree(statbuf);
+unlock_and_exit:
+ mutex_unlock(&rdir->mutex);
+exit:
return err;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5947628aefe..18f74ec4dce 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
P9_DPRINTK(P9_DEBUG_VFS,
"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
- retval = buflen;
-
+ retval = strnlen(buffer, buflen);
done:
kfree(st);
return retval;
@@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
__putname(link);
link = ERR_PTR(len);
} else
- link[len] = 0;
+ link[min(len, PATH_MAX-1)] = 0;
}
nd_set_link(nd, link);
diff --git a/fs/Kconfig b/fs/Kconfig
index d4bf8caad8d..64d44efad7a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL
config HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
- (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
+ depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
+ SYS_SUPPORTS_HUGETLBFS || BROKEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 681c2a7b013..39b301662f2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
fscache_wait_on_page_write(vnode->cache, page);
fscache_uncache_page(vnode->cache, page);
- ClearPageFsCache(page);
}
#endif
@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page)) {
- if (fscache_check_page_write(vnode->cache, page)) {
- if (!(gfp_flags & __GFP_WAIT)) {
- _leave(" = F [cache busy]");
- return 0;
- }
- fscache_wait_on_page_write(vnode->cache, page);
- }
-
- fscache_uncache_page(vnode->cache, page);
- ClearPageFsCache(page);
+ if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
+ _leave(" = F [cache busy]");
+ return 0;
}
#endif
diff --git a/fs/bio.c b/fs/bio.c
index 402cb84a92a..12da5db8682 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -325,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio)
* @gfp_mask: allocation mask to use
* @nr_iovecs: number of iovecs
*
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask
- * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ * bio_alloc will allocate a bio and associated bio_vec array that can hold
+ * at least @nr_iovecs entries. Allocations will be done from the
+ * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
+ *
+ * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ * a bio. This is due to the mempool guarantees. To make this work, callers
+ * must never allocate more than 1 bio at a time from this pool. Callers
+ * that need to allocate more than 1 bio must always submit the previously
+ * allocated bio for IO before attempting to allocate a new one. Failure to
+ * do so can cause livelocks under memory pressure.
*
* RETURNS:
* Pointer to new bio on success, NULL on failure.
@@ -350,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio)
}
/**
- * bio_alloc - allocate a bio for I/O
+ * bio_kmalloc - allocate a bio for I/O using kmalloc()
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
*
* Description:
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
+ * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
+ * %__GFP_WAIT, the allocation is guaranteed to succeed.
*
**/
struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
@@ -407,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio);
*
* Description:
* Put a reference to a &struct bio, either one you have gotten with
- * bio_alloc or bio_get. The last put of a bio will free it.
+ * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
**/
void bio_put(struct bio *bio)
{
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9cf4b926f8e..8bed0557d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1248,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
} else {
- put_disk(disk);
module_put(disk->fops->owner);
+ put_disk(disk);
disk = NULL;
if (bdev->bd_contains == bdev) {
if (bdev->bd_disk->fops->open) {
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 69b355ae7f4..36160424427 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
.set = btrfs_xattr_acl_access_set,
};
-#else /* CONFIG_BTRFS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
int btrfs_acl_chmod(struct inode *inode)
{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
return 0;
}
-#endif /* CONFIG_BTRFS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2f..c0861e781cd 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
};
/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time. It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations. This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+ struct btrfs_work work;
+ struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+ struct worker_start *start;
+ start = container_of(work, struct worker_start, work);
+ btrfs_start_workers(start->queue, 1);
+ kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+ struct worker_start *start;
+ int ret;
+
+ start = kzalloc(sizeof(*start), GFP_NOFS);
+ if (!start)
+ return -ENOMEM;
+
+ start->work.func = start_new_worker_func;
+ start->queue = queue;
+ ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+ if (ret)
+ kfree(start);
+ return ret;
+}
+
+/*
* helper function to move a thread onto the idle list after it
* has finished some requests.
*/
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
goto out;
workers->atomic_start_pending = 0;
- if (workers->num_workers >= workers->max_workers)
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers)
goto out;
+ workers->num_workers_starting += 1;
spin_unlock_irqrestore(&workers->lock, flags);
- btrfs_start_workers(workers, 1);
+ start_new_worker(workers);
return;
out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
/*
* simple init on struct btrfs_workers
*/
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_helper)
{
workers->num_workers = 0;
+ workers->num_workers_starting = 0;
INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
workers->name = name;
workers->ordered = 0;
workers->atomic_start_pending = 0;
- workers->atomic_worker_start = 0;
+ workers->atomic_worker_start = async_helper;
}
/*
* starts new worker threads. This does not enforce the max worker
* count in case you need to temporarily go past it.
*/
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+ int num_workers)
{
struct btrfs_worker_thread *worker;
int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
workers->num_workers++;
+ workers->num_workers_starting--;
+ WARN_ON(workers->num_workers_starting < 0);
spin_unlock_irq(&workers->lock);
}
return 0;
@@ -452,6 +504,14 @@ fail:
return ret;
}
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+ spin_lock_irq(&workers->lock);
+ workers->num_workers_starting += num_workers;
+ spin_unlock_irq(&workers->lock);
+ return __btrfs_start_workers(workers, num_workers);
+}
+
/*
* run through the list and find a worker thread that doesn't have a lot
* to do right now. This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
struct list_head *next;
- int enforce_min = workers->num_workers < workers->max_workers;
+ int enforce_min;
+
+ enforce_min = (workers->num_workers + workers->num_workers_starting) <
+ workers->max_workers;
/*
* if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
worker = next_worker(workers);
if (!worker) {
- if (workers->num_workers >= workers->max_workers) {
+ if (workers->num_workers + workers->num_workers_starting >=
+ workers->max_workers) {
goto fallback;
} else if (workers->atomic_worker_start) {
workers->atomic_start_pending = 1;
goto fallback;
} else {
+ workers->num_workers_starting++;
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
- btrfs_start_workers(workers, 1);
+ __btrfs_start_workers(workers, 1);
goto again;
}
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec1..5077746cf85 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
/* current number of running workers */
int num_workers;
+ int num_workers_starting;
+
/* max number of workers allowed. changed by btrfs_start_workers */
int max_workers;
@@ -78,9 +80,10 @@ struct btrfs_workers {
/*
* are we allowed to sleep while starting workers or are we required
- * to start them at a later time?
+ * to start them at a later time? If we can't sleep, this indicates
+ * which queue we need to use to schedule thread creation.
*/
- int atomic_worker_start;
+ struct btrfs_workers *atomic_worker_start;
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+ struct btrfs_workers *async_starter);
int btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a54d354cefc..f6783a42f01 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ u64 last_sub_trans;
+
/*
* transid that last logged this inode
*/
@@ -128,12 +134,14 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
- * These two counters are for delalloc metadata reservations. We keep
- * track of how many extents we've accounted for vs how many extents we
- * have.
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
*/
- int delalloc_reserved_extents;
- int delalloc_extents;
+ spinlock_t accounting_lock;
+ int reserved_extents;
+ int outstanding_extents;
/*
* ordered_data_close is set by truncate when a file that used
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dd8ced9814c..444b3e9b92a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -691,14 +691,17 @@ struct btrfs_space_info {
struct list_head list;
+ /* for controlling how we free up space for allocations */
+ wait_queue_head_t allocate_wait;
+ wait_queue_head_t flush_wait;
+ int allocating_chunk;
+ int flushing;
+
/* for block groups in our same type */
struct list_head block_groups;
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
-
- int allocating_chunk;
- wait_queue_head_t wait;
};
/*
@@ -907,6 +910,7 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
+ struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
struct btrfs_workers endio_workers;
@@ -914,6 +918,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
+ struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -1004,7 +1009,10 @@ struct btrfs_root {
atomic_t log_writers;
atomic_t log_commit[2];
unsigned long log_transid;
+ unsigned long last_log_commit;
unsigned long log_batch;
+ pid_t log_start_pid;
+ bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -1145,6 +1153,7 @@ struct btrfs_root {
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
+#define BTRFS_MOUNT_DISCARD (1 << 10)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2323,7 +2332,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
-extern struct dentry_operations btrfs_dentry_operations;
+extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2366,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
/* acl.c */
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
int btrfs_check_acl(struct inode *inode, int mask);
#else
#define btrfs_check_acl NULL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index af0435f79fa..02b6afbd745 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_writers, 0);
root->log_batch = 0;
root->log_transid = 0;
+ root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1087,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->last_log_commit = 0;
return 0;
}
@@ -1746,21 +1748,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -EINVAL;
goto fail_iput;
}
-printk("thread pool is %d\n", fs_info->thread_pool_size);
- /*
- * we need to start all the end_io workers up front because the
- * queue work function gets called at interrupt time, and so it
- * cannot dynamically grow.
- */
+
+ btrfs_init_workers(&fs_info->generic_worker,
+ "genwork", 1, NULL);
+
btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size));
+ fs_info->thread_pool_size),
+ &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1774,15 +1780,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
fs_info->delalloc_workers.idle_thresh = 2;
fs_info->delalloc_workers.ordered = 1;
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size);
+ "endio-meta-write", fs_info->thread_pool_size,
+ &fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1794,12 +1805,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
- fs_info->endio_workers.atomic_worker_start = 1;
- fs_info->endio_meta_workers.atomic_worker_start = 1;
- fs_info->endio_write_workers.atomic_worker_start = 1;
- fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
btrfs_start_workers(&fs_info->workers, 1);
+ btrfs_start_workers(&fs_info->generic_worker, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -1807,6 +1814,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
+ btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2012,6 +2020,7 @@ fail_chunk_root:
free_extent_buffer(chunk_root->node);
free_extent_buffer(chunk_root->commit_root);
fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2020,6 +2029,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2437,6 +2447,7 @@ int close_ctree(struct btrfs_root *root)
iput(fs_info->btree_inode);
+ btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2445,6 +2456,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 359a754c782..94627c4cc19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1568,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-#ifdef BIO_RW_DISCARD
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
DISCARD_FL_BARRIER);
}
-#endif
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
u64 num_bytes)
{
-#ifdef BIO_RW_DISCARD
int ret;
u64 map_length = num_bytes;
struct btrfs_multi_bio *multi = NULL;
+ if (!btrfs_test_opt(root, DISCARD))
+ return 0;
+
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
bytenr, &map_length, &multi, 0);
@@ -1604,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
return ret;
-#else
- return 0;
-#endif
}
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2824,14 +2821,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
num_items);
spin_lock(&meta_sinfo->lock);
- if (BTRFS_I(inode)->delalloc_reserved_extents <=
- BTRFS_I(inode)->delalloc_extents) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ if (BTRFS_I(inode)->reserved_extents <=
+ BTRFS_I(inode)->outstanding_extents) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
spin_unlock(&meta_sinfo->lock);
return 0;
}
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->delalloc_reserved_extents--;
- BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+ BTRFS_I(inode)->reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
if (meta_sinfo->bytes_delalloc < num_bytes) {
bug = true;
@@ -2864,6 +2864,107 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
meta_sinfo->force_delalloc = 0;
}
+struct async_flush {
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+ struct btrfs_work work;
+};
+
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+ struct async_flush *async;
+ struct btrfs_root *root;
+ struct btrfs_space_info *info;
+
+ async = container_of(work, struct async_flush, work);
+ root = async->root;
+ info = async->info;
+
+ btrfs_start_delalloc_inodes(root);
+ wake_up(&info->flush_wait);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+
+ kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+ DEFINE_WAIT(wait);
+ u64 used;
+
+ while (1) {
+ prepare_to_wait(&info->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&info->lock);
+ if (!info->flushing) {
+ spin_unlock(&info->lock);
+ break;
+ }
+
+ used = info->bytes_used + info->bytes_reserved +
+ info->bytes_pinned + info->bytes_readonly +
+ info->bytes_super + info->bytes_root +
+ info->bytes_may_use + info->bytes_delalloc;
+ if (used < info->total_bytes) {
+ spin_unlock(&info->lock);
+ break;
+ }
+ spin_unlock(&info->lock);
+ schedule();
+ }
+ finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct async_flush *async;
+ bool wait = false;
+
+ spin_lock(&info->lock);
+
+ if (!info->flushing) {
+ info->flushing = 1;
+ init_waitqueue_head(&info->flush_wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_on_flush(info);
+ return;
+ }
+
+ async = kzalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ goto flush;
+
+ async->root = root;
+ async->info = info;
+ async->work.func = flush_delalloc_async;
+
+ btrfs_queue_worker(&root->fs_info->enospc_workers,
+ &async->work);
+ wait_on_flush(info);
+ return;
+
+flush:
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
+
+ spin_lock(&info->lock);
+ info->flushing = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->flush_wait);
+}
+
static int maybe_allocate_chunk(struct btrfs_root *root,
struct btrfs_space_info *info)
{
@@ -2876,10 +2977,10 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
free_space = btrfs_super_total_bytes(disk_super);
/*
- * we allow the metadata to grow to a max of either 5gb or 5% of the
+ * we allow the metadata to grow to a max of either 10gb or 5% of the
* space in the volume.
*/
- min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+ min_metadata = min((u64)10 * 1024 * 1024 * 1024,
div64_u64(free_space * 5, 100));
if (info->total_bytes >= min_metadata) {
spin_unlock(&info->lock);
@@ -2894,7 +2995,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
if (!info->allocating_chunk) {
info->force_alloc = 1;
info->allocating_chunk = 1;
- init_waitqueue_head(&info->wait);
+ init_waitqueue_head(&info->allocate_wait);
} else {
wait = true;
}
@@ -2902,7 +3003,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
spin_unlock(&info->lock);
if (wait) {
- wait_event(info->wait,
+ wait_event(info->allocate_wait,
!info->allocating_chunk);
return 1;
}
@@ -2923,7 +3024,7 @@ out:
spin_lock(&info->lock);
info->allocating_chunk = 0;
spin_unlock(&info->lock);
- wake_up(&info->wait);
+ wake_up(&info->allocate_wait);
if (ret)
return 0;
@@ -2981,21 +3082,20 @@ again:
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ flush_delalloc(root, meta_sinfo);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
printk(KERN_ERR "enospc, has %d, reserved %d\n",
- BTRFS_I(inode)->delalloc_extents,
- BTRFS_I(inode)->delalloc_reserved_extents);
+ BTRFS_I(inode)->outstanding_extents,
+ BTRFS_I(inode)->reserved_extents);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
- BTRFS_I(inode)->delalloc_reserved_extents++;
+ BTRFS_I(inode)->reserved_extents++;
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
@@ -3094,8 +3194,7 @@ again:
}
if (retries == 2) {
- btrfs_start_delalloc_inodes(root);
- btrfs_wait_ordered_extents(root, 0);
+ flush_delalloc(root, meta_sinfo);
goto again;
}
spin_lock(&meta_sinfo->lock);
@@ -3588,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
if (is_data)
goto pinit;
+ /*
+ * discard is sloooow, and so triggering discards on
+ * individual btree blocks isn't a good plan. Just
+ * pin everything in discard mode.
+ */
+ if (btrfs_test_opt(root, DISCARD))
+ goto pinit;
+
buf = btrfs_find_tree_block(root, bytenr, num_bytes);
if (!buf)
goto pinit;
@@ -3995,7 +4102,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
}
enum btrfs_loop_type {
- LOOP_CACHED_ONLY = 0,
+ LOOP_FIND_IDEAL = 0,
LOOP_CACHING_NOWAIT = 1,
LOOP_CACHING_WAIT = 2,
LOOP_ALLOC_CHUNK = 3,
@@ -4024,11 +4131,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group = NULL;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
+ int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
+ bool failed_alloc = false;
+ u64 ideal_cache_percent = 0;
+ u64 ideal_cache_offset = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -4064,14 +4175,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
empty_cluster = 0;
if (search_start == hint_byte) {
+ideal_cache:
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
+ *
+ * However if we are re-searching with an ideal block group
+ * picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, data) &&
- block_group_cache_done(block_group)) {
+ (block_group->cached != BTRFS_CACHE_NO ||
+ search_start == ideal_cache_offset)) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -4083,13 +4199,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
*/
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
- } else
+ } else {
goto have_block_group;
+ }
} else if (block_group) {
btrfs_put_block_group(block_group);
}
}
-
search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -4101,28 +4217,45 @@ search:
have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ u64 free_percent;
+
+ free_percent = btrfs_block_group_used(&block_group->item);
+ free_percent *= 100;
+ free_percent = div64_u64(free_percent,
+ block_group->key.offset);
+ free_percent = 100 - free_percent;
+ if (free_percent > ideal_cache_percent &&
+ likely(!block_group->ro)) {
+ ideal_cache_offset = block_group->key.objectid;
+ ideal_cache_percent = free_percent;
+ }
+
/*
- * we want to start caching kthreads, but not too many
- * right off the bat so we don't overwhelm the system,
- * so only start them if there are less than 2 and we're
- * in the initial allocation phase.
+ * We only want to start kthread caching if we are at
+ * the point where we will wait for caching to make
+ * progress, or if our ideal search is over and we've
+ * found somebody to start caching.
*/
if (loop > LOOP_CACHING_NOWAIT ||
- atomic_read(&space_info->caching_threads) < 2) {
+ (loop > LOOP_FIND_IDEAL &&
+ atomic_read(&space_info->caching_threads) < 2)) {
ret = cache_block_group(block_group);
BUG_ON(ret);
}
- }
-
- cached = block_group_cache_done(block_group);
- if (unlikely(!cached)) {
found_uncached_bg = true;
- /* if we only want cached bgs, loop */
- if (loop == LOOP_CACHED_ONLY)
+ /*
+ * If loop is set for cached only, try the next block
+ * group.
+ */
+ if (loop == LOOP_FIND_IDEAL)
goto loop;
}
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached))
+ found_uncached_bg = true;
+
if (unlikely(block_group->ro))
goto loop;
@@ -4233,14 +4366,23 @@ refill_cluster:
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
- if (!offset && (cached || (!cached &&
- loop == LOOP_CACHING_NOWAIT))) {
- goto loop;
- } else if (!offset && (!cached &&
- loop > LOOP_CACHING_NOWAIT)) {
+ /*
+ * If we didn't find a chunk, and we haven't failed on this
+ * block group before, and this block group is in the middle of
+ * caching and we are ok with waiting, then go ahead and wait
+ * for progress to be made, and set failed_alloc to true.
+ *
+ * If failed_alloc is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !failed_alloc && !cached &&
+ loop > LOOP_CACHING_NOWAIT) {
wait_block_group_cache_progress(block_group,
- num_bytes + empty_size);
+ num_bytes + empty_size);
+ failed_alloc = true;
goto have_block_group;
+ } else if (!offset) {
+ goto loop;
}
checks:
search_start = stripe_align(root, offset);
@@ -4288,13 +4430,16 @@ checks:
break;
loop:
failed_cluster_refill = false;
+ failed_alloc = false;
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
- /* LOOP_CACHED_ONLY, only search fully cached block groups
- * LOOP_CACHING_NOWAIT, search partially cached block groups, but
- * dont wait foR them to finish caching
+ /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
+ * for them to make caching progress. Also
+ * determine the best possible bg to cache
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -4303,12 +4448,47 @@ loop:
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
(found_uncached_bg || empty_size || empty_cluster ||
allowed_chunk_alloc)) {
- if (found_uncached_bg) {
+ if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
- if (loop < LOOP_CACHING_WAIT) {
- loop++;
+ loop++;
+ if (!ideal_cache_percent &&
+ atomic_read(&space_info->caching_threads))
goto search;
- }
+
+ /*
+ * 1 of the following 2 things have happened so far
+ *
+ * 1) We found an ideal block group for caching that
+ * is mostly full and will cache quickly, so we might
+ * as well wait for it.
+ *
+ * 2) We searched for cached only and we didn't find
+ * anything, and we didn't start any caching kthreads
+ * either, so chances are we will loop through and
+ * start a couple caching kthreads, and then come back
+ * around and just wait for them. This will be slower
+ * because we will have 2 caching kthreads reading at
+ * the same time when we could have just started one
+ * and waited for it to get far enough to give us an
+ * allocation, so go ahead and go to the wait caching
+ * loop.
+ */
+ loop = LOOP_CACHING_WAIT;
+ search_start = ideal_cache_offset;
+ ideal_cache_percent = 0;
+ goto ideal_cache;
+ } else if (loop == LOOP_FIND_IDEAL) {
+ /*
+ * Didn't find a uncached bg, wait on anything we find
+ * next.
+ */
+ loop = LOOP_CACHING_WAIT;
+ goto search;
+ }
+
+ if (loop < LOOP_CACHING_WAIT) {
+ loop++;
+ goto search;
}
if (loop == LOOP_ALLOC_CHUNK) {
@@ -4320,7 +4500,8 @@ loop:
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data, 1);
allowed_chunk_alloc = 0;
- } else {
+ done_chunk_alloc = 1;
+ } else if (!done_chunk_alloc) {
space_info->force_alloc = 1;
}
@@ -4799,6 +4980,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 refs;
+ u64 flags;
u64 last = 0;
u32 nritems;
u32 blocksize;
@@ -4836,15 +5018,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
generation <= root->root_key.offset)
continue;
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &refs, &flags);
+ BUG_ON(ret);
+ BUG_ON(refs == 0);
+
if (wc->stage == DROP_REFERENCE) {
- ret = btrfs_lookup_extent_info(trans, root,
- bytenr, blocksize,
- &refs, NULL);
- BUG_ON(ret);
- BUG_ON(refs == 0);
if (refs == 1)
goto reada;
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
if (!wc->update_ref ||
generation <= root->root_key.offset)
continue;
@@ -4853,6 +5039,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
&wc->update_progress);
if (ret < 0)
continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
}
reada:
ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4876,7 +5066,7 @@ reada:
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc)
+ struct walk_control *wc, int lookup_info)
{
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
@@ -4891,8 +5081,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
- (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, root,
eb->start, eb->len,
@@ -4953,7 +5144,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc)
+ struct walk_control *wc, int *lookup_info)
{
u64 bytenr;
u64 generation;
@@ -4973,8 +5164,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
- generation <= root->root_key.offset)
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
return 1;
+ }
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
blocksize = btrfs_level_size(root, level - 1);
@@ -4987,14 +5180,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- if (wc->stage == DROP_REFERENCE) {
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
- &wc->refs[level - 1],
- &wc->flags[level - 1]);
- BUG_ON(ret);
- BUG_ON(wc->refs[level - 1] == 0);
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level - 1] == 0);
+ *lookup_info = 0;
+ if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
@@ -5008,12 +5206,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
wc->stage = UPDATE_BACKREF;
wc->shared_level = level - 1;
}
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
}
if (!btrfs_buffer_uptodate(next, generation)) {
btrfs_tree_unlock(next);
free_extent_buffer(next);
next = NULL;
+ *lookup_info = 1;
}
if (!next) {
@@ -5036,21 +5239,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
skip:
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- parent = path->nodes[level]->start;
- } else {
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level]));
- parent = 0;
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
}
-
- ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
- BUG_ON(ret);
-
btrfs_tree_unlock(next);
free_extent_buffer(next);
+ *lookup_info = 1;
return 1;
}
@@ -5164,6 +5368,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
int level = wc->level;
+ int lookup_info = 1;
int ret;
while (level >= 0) {
@@ -5171,14 +5376,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
btrfs_header_nritems(path->nodes[level]))
break;
- ret = walk_down_proc(trans, root, path, wc);
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
if (ret > 0)
break;
if (level == 0)
break;
- ret = do_walk_down(trans, root, path, wc);
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
if (ret > 0) {
path->slots[level]++;
continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index de1793ba004..96577e8bf9f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state, int bits, int wake,
int delete)
{
- int ret = state->state & bits;
+ int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int ret = state->state & bits_to_clear;
if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
- state->state &= ~bits;
+ state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
if (delete || state->state == 0) {
@@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0,
NULL, mask);
}
@@ -1401,12 +1403,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_pages,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback,
- int set_private2)
+ unsigned long op)
{
int ret;
struct page *pages[16];
@@ -1416,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int i;
int clear_bits = 0;
- if (clear_unlock)
+ if (op & EXTENT_CLEAR_UNLOCK)
clear_bits |= EXTENT_LOCKED;
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_bits |= EXTENT_DIRTY;
- if (clear_delalloc)
+ if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
+ if (op & EXTENT_CLEAR_ACCOUNTING)
+ clear_bits |= EXTENT_DO_ACCOUNTING;
+
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
- set_private2))
+ if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+ EXTENT_SET_PRIVATE2)))
return 0;
while (nr_pages > 0) {
@@ -1435,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
- if (set_private2)
+ if (op & EXTENT_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
}
- if (clear_dirty)
+ if (op & EXTENT_CLEAR_DIRTY)
clear_page_dirty_for_io(pages[i]);
- if (set_writeback)
+ if (op & EXTENT_SET_WRITEBACK)
set_page_writeback(pages[i]);
- if (end_writeback)
+ if (op & EXTENT_END_WRITEBACK)
end_page_writeback(pages[i]);
- if (unlock_pages)
+ if (op & EXTENT_CLEAR_UNLOCK_PAGE)
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
@@ -2714,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
lock_extent(tree, start, end, GFP_NOFS);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
1, 1, NULL, GFP_NOFS);
return 0;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4794ec891fe..36de250a7b2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
@@ -25,6 +26,16 @@
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
+/* these are flags for extent_clear_unlock_delalloc */
+#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
+#define EXTENT_CLEAR_UNLOCK 0x2
+#define EXTENT_CLEAR_DELALLOC 0x4
+#define EXTENT_CLEAR_DIRTY 0x8
+#define EXTENT_SET_WRITEBACK 0x10
+#define EXTENT_END_WRITEBACK 0x20
+#define EXTENT_SET_PRIVATE2 0x40
+#define EXTENT_CLEAR_ACCOUNTING 0x80
+
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
@@ -288,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int extent_clear_unlock_delalloc(struct inode *inode,
struct extent_io_tree *tree,
u64 start, u64 end, struct page *locked_page,
- int unlock_page,
- int clear_unlock,
- int clear_delalloc, int clear_dirty,
- int set_writeback,
- int end_writeback,
- int set_private2);
+ unsigned long op);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2c726b7b9fa..ccbdcb54ec5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(em->start != start || !em);
+ WARN_ON(!em || em->start != start);
if (!em)
goto out;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f19e1259a97..06550affbd2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -878,7 +878,8 @@ again:
btrfs_put_ordered_extent(ordered);
clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, GFP_NOFS);
@@ -1085,8 +1086,10 @@ out_nolock:
btrfs_end_transaction(trans, root);
else
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
}
}
if (file->f_flags & O_DIRECT) {
@@ -1136,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
int ret = 0;
struct btrfs_trans_handle *trans;
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
/*
* check the transaction that last modified this inode
* and see if its already been committed
@@ -1143,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (!BTRFS_I(inode)->last_trans)
goto out;
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
mutex_lock(&root->fs_info->trans_mutex);
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
@@ -1152,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
}
mutex_unlock(&root->fs_info->trans_mutex);
- root->log_batch++;
- filemap_fdatawrite(inode->i_mapping);
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- root->log_batch++;
-
- if (datasync && !(inode->i_state & I_DIRTY_PAGES))
- goto out;
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
@@ -1187,14 +1195,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
*/
mutex_unlock(&dentry->d_inode->i_mutex);
- if (ret > 0) {
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- ret = btrfs_end_transaction(trans, root);
- else
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5c2caad7621..cb2849f0325 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1296,7 +1296,7 @@ again:
window_start = entry->offset;
window_free = entry->bytes;
last = entry;
- max_extent = 0;
+ max_extent = entry->bytes;
} else {
last = next;
window_free += next->bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 112e5aa8589..b3ad168a0bf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -424,9 +424,12 @@ again:
* and free up our temp pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 0,
- 0, 1, 1, 1, 0);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
ret = 0;
goto free_pages_out;
}
@@ -535,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree;
- int ret;
+ int ret = 0;
if (list_empty(&async_cow->extents))
return 0;
@@ -549,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
io_tree = &BTRFS_I(inode)->io_tree;
+retry:
/* did the compression code fall back to uncompressed IO? */
if (!async_extent->pages) {
int page_started = 0;
@@ -559,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->ram_size - 1, GFP_NOFS);
/* allocate blocks */
- cow_file_range(inode, async_cow->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ ret = cow_file_range(inode, async_cow->locked_page,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0);
/*
* if page_started, cow_file_range inserted an
@@ -571,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
* and IO for us. Otherwise, we need to submit
* all those pages down to the drive.
*/
- if (!page_started)
+ if (!page_started && !ret)
extent_write_locked_range(io_tree,
inode, async_extent->start,
async_extent->start +
@@ -599,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->compressed_size,
0, alloc_hint,
(u64)-1, &ins, 1);
- BUG_ON(ret);
+ if (ret) {
+ int i;
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
+ goto retry;
+ }
+
em = alloc_extent_map(GFP_NOFS);
em->start = async_extent->start;
em->len = async_extent->ram_size;
@@ -637,11 +655,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
* clear dirty, set writeback and unlock the pages.
*/
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, 1, 1, 0, 1, 1, 0, 0);
+ &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
@@ -712,9 +733,15 @@ static noinline int cow_file_range(struct inode *inode,
start, end, 0, NULL);
if (ret == 0) {
extent_clear_unlock_delalloc(inode,
- &BTRFS_I(inode)->io_tree,
- start, end, NULL, 1, 1,
- 1, 1, 1, 1, 0);
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_ACCOUNTING |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
@@ -731,13 +758,29 @@ static noinline int cow_file_range(struct inode *inode,
em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
start, num_bytes);
if (em) {
- alloc_hint = em->block_start;
- free_extent_map(em);
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
}
read_unlock(&BTRFS_I(inode)->extent_tree.lock);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
+ unsigned long op;
+
cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
@@ -789,10 +832,13 @@ static noinline int cow_file_range(struct inode *inode,
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
+ op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+ op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2;
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
start, start + ram_size - 1,
- locked_page, unlock, 1,
- 1, 0, 0, 0, 1);
+ locked_page, op);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +910,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
- EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+ 1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
async_cow->inode = inode;
@@ -1006,6 +1052,7 @@ next_slot:
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
+ extent_type = 0;
goto out_check;
}
@@ -1112,8 +1159,10 @@ out_check:
BUG_ON(ret);
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
- cur_offset, cur_offset + num_bytes - 1,
- locked_page, 1, 1, 1, 0, 0, 0, 1);
+ cur_offset, cur_offset + num_bytes - 1,
+ locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+ EXTENT_SET_PRIVATE2);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1178,15 +1227,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
root->fs_info->max_extent);
/*
- * if we break a large extent up then leave delalloc_extents be,
- * since we've already accounted for the large extent.
+ * if we break a large extent up then leave oustanding_extents
+ * be, since we've already accounted for the large extent.
*/
if (div64_u64(new_size + root->fs_info->max_extent - 1,
root->fs_info->max_extent) < num_extents)
return 0;
}
- BTRFS_I(inode)->delalloc_extents++;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
return 0;
}
@@ -1217,7 +1268,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= root->fs_info->max_extent) {
- BTRFS_I(inode)->delalloc_extents--;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
return 0;
}
@@ -1231,7 +1284,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
root->fs_info->max_extent) > num_extents)
return 0;
- BTRFS_I(inode)->delalloc_extents--;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
return 0;
}
@@ -1253,7 +1308,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- BTRFS_I(inode)->delalloc_extents++;
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1281,8 +1338,12 @@ static int btrfs_clear_bit_hook(struct inode *inode,
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
- BTRFS_I(inode)->delalloc_extents--;
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (bits & EXTENT_DO_ACCOUNTING) {
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ }
spin_lock(&root->fs_info->delalloc_lock);
if (state->end - state->start + 1 >
@@ -2442,7 +2503,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
root = BTRFS_I(dir)->root;
+ /*
+ * 5 items for unlink inode
+ * 1 for orphan
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 6);
+ return PTR_ERR(trans);
+ }
btrfs_set_trans_block_group(trans, dir);
@@ -2457,6 +2530,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -2537,7 +2611,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
+ ret = btrfs_reserve_metadata_space(root, 5);
+ if (ret)
+ return ret;
+
trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_unreserve_metadata_space(root, 5);
+ return PTR_ERR(trans);
+ }
+
btrfs_set_trans_block_group(trans, dir);
if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -2560,6 +2643,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
+ btrfs_unreserve_metadata_space(root, 5);
btrfs_btree_balance_dirty(root, nr);
if (ret && !err)
@@ -3000,12 +3084,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
+ ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret)
+ goto out;
ret = -ENOMEM;
again:
page = grab_cache_page(mapping, index);
- if (!page)
+ if (!page) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
goto out;
+ }
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -3038,6 +3132,10 @@ again:
goto again;
}
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
+
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3056,6 +3154,9 @@ again:
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ if (ret)
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
unlock_page(page);
page_cache_release(page);
out:
@@ -3079,7 +3180,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
if (size <= hole_start)
return 0;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (err)
+ return err;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -3448,6 +3551,7 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->generation = 0;
bi->sequence = 0;
bi->last_trans = 0;
+ bi->last_sub_trans = 0;
bi->logged_trans = 0;
bi->delalloc_bytes = 0;
bi->reserved_bytes = 0;
@@ -3598,12 +3702,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
{
struct btrfs_root *root;
- if (!dentry->d_inode)
- return 0;
+ if (!dentry->d_inode && !IS_ROOT(dentry))
+ dentry = dentry->d_parent;
- root = BTRFS_I(dentry->d_inode)->root;
- if (btrfs_root_refs(&root->root_item) == 0)
- return 1;
+ if (dentry->d_inode) {
+ root = BTRFS_I(dentry->d_inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ }
return 0;
}
@@ -4808,7 +4914,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+ NULL, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -4821,8 +4928,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
- 1, 1, NULL, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -4917,7 +5024,8 @@ again:
* prepare_pages in the normal write path.
*/
clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+ EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+ GFP_NOFS);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
@@ -4944,7 +5052,9 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
@@ -4969,7 +5079,9 @@ static void btrfs_truncate(struct inode *inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
- btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ if (ret)
+ return;
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
@@ -5064,9 +5176,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
ei->last_trans = 0;
+ ei->last_sub_trans = 0;
ei->logged_trans = 0;
- ei->delalloc_extents = 0;
- ei->delalloc_reserved_extents = 0;
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+ ei->root = NULL;
+ spin_lock_init(&ei->accounting_lock);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5082,6 +5197,14 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(inode->i_data.nrpages);
/*
+ * This can happen where we create an inode, but somebody else also
+ * created the same inode and we need to destroy the one we already
+ * created.
+ */
+ if (!root)
+ goto free;
+
+ /*
* Make sure we're properly removed from the ordered operation
* lists.
*/
@@ -5116,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode)
}
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -5221,11 +5345,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
return -ENOTEMPTY;
/*
- * 2 items for dir items
- * 1 item for orphan entry
- * 1 item for ref
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
*/
- ret = btrfs_reserve_metadata_space(root, 4);
+ ret = btrfs_reserve_metadata_space(root, 11);
if (ret)
return ret;
@@ -5341,7 +5468,7 @@ out_fail:
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
- btrfs_unreserve_metadata_space(root, 4);
+ btrfs_unreserve_metadata_space(root, 11);
return ret;
}
@@ -5805,6 +5932,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.removexattr = btrfs_removexattr,
};
-struct dentry_operations btrfs_dentry_operations = {
+const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9a780c8d0ac..cdbb054102b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -830,6 +830,7 @@ out_up_write:
out_unlock:
mutex_unlock(&inode->i_mutex);
if (!err) {
+ shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
}
@@ -1122,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datao + datal > off + len)
- datal = off + len - key.offset - datao;
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 897fba835f8..5799bc46a30 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+ inode, 1);
+
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faa..cfcc93c93a7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
BUG_ON(!rc->block_group);
btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size);
+ fs_info->thread_pool_size, NULL);
rc->extent_root = extent_root;
btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
mapping_tree_init(&rc->reloc_root_tree);
INIT_LIST_HEAD(&rc->reloc_roots);
btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size);
+ root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9351428f30e..67fa2d29d66 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
return ret;
}
@@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
BUG_ON(refs != 0);
ret = btrfs_del_item(trans, root, path);
out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9de9b223641..752a5463bf5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
- Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+ Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+ Opt_discard, Opt_err,
};
static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
{Opt_notreelog, "notreelog"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
+ {Opt_discard, "discard"},
{Opt_err, NULL},
};
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->metadata_ratio);
}
break;
+ case Opt_discard:
+ btrfs_set_opt(info->mount_opt, DISCARD);
+ break;
default:
break;
}
@@ -344,7 +349,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0b8f36d4400..c207e8c32c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root)
}
}
+enum btrfs_trans_type {
+ TRANS_START,
+ TRANS_JOIN,
+ TRANS_USERSPACE,
+};
+
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- int num_blocks, int wait)
+ int num_blocks, int type)
{
struct btrfs_trans_handle *h =
kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
mutex_lock(&root->fs_info->trans_mutex);
if (!root->fs_info->log_root_recovering &&
- ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
wait_current_trans(root);
ret = join_transaction(root);
BUG_ON(ret);
@@ -186,7 +193,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
- if (!current->journal_info)
+ if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
root->fs_info->running_transaction->use_count++;
@@ -198,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, 1);
+ return start_transaction(root, num_blocks, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, 0);
+ return start_transaction(root, num_blocks, TRANS_JOIN);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
- return start_transaction(r, num_blocks, 2);
+ return start_transaction(r, num_blocks, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -344,10 +351,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * those extents are sent to disk but does not wait on them
*/
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages)
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
{
int ret;
int err = 0;
@@ -394,6 +401,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
page_cache_release(page);
}
}
+ if (err)
+ werr = err;
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
while (1) {
ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
EXTENT_DIRTY);
@@ -424,6 +454,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return werr;
}
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int ret2;
+
+ ret = btrfs_write_marked_extents(root, dirty_pages);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+ return ret || ret2;
+}
+
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c6740491..d4e3e7a6938 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7827841b55c..741666a7676 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (!root->log_start_pid) {
+ root->log_start_pid = current->pid;
+ root->log_multiple_pids = false;
+ } else if (root->log_start_pid != current->pid) {
+ root->log_multiple_pids = true;
+ }
+
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
return 0;
}
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ u64 log_transid = 0;
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
unsigned long batch = root->log_batch;
- mutex_unlock(&root->log_mutex);
- schedule_timeout_uninterruptible(1);
- mutex_lock(&root->log_mutex);
-
+ if (root->log_multiple_pids) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
btrfs_set_root_node(&log->root_item, log->node);
root->log_batch = 0;
+ log_transid = root->log_transid;
root->log_transid++;
log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
smp_mb();
/*
* log tree has been flushed to disk, new modifications of
@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
wait_log_commit(trans, log_root_tree,
log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* check the full commit flag again
*/
if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
BUG_ON(ret);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages);
btrfs_set_super_log_root(&root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- write_ctree_super(trans, root->fs_info->tree_root, 2);
+ write_ctree_super(trans, root->fs_info->tree_root, 1);
ret = 0;
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
@@ -2852,6 +2876,21 @@ out:
return ret;
}
+static int inode_in_log(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == trans->transid &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -2891,6 +2930,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
+ if (inode_in_log(trans, inode)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
start_log_trans(trans, root);
ret = btrfs_log_inode(trans, root, inode, inode_only);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16..0776eacb508 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b0fc93f95fd..b6dd5967c48 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
* attributes are handled directly.
*/
struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
&btrfs_xattr_acl_access_handler,
&btrfs_xattr_acl_default_handler,
#endif
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 431accd475a..27089311fbe 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -114,8 +114,9 @@ nomem_lookup_data:
/*
* attempt to look up the nominated node in this cache
+ * - return -ETIMEDOUT to be scheduled again
*/
-static void cachefiles_lookup_object(struct fscache_object *_object)
+static int cachefiles_lookup_object(struct fscache_object *_object)
{
struct cachefiles_lookup_data *lookup_data;
struct cachefiles_object *parent, *object;
@@ -145,13 +146,15 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
cachefiles_attr_changed(&object->fscache);
- if (ret < 0) {
- printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
- ret);
+ if (ret < 0 && ret != -ETIMEDOUT) {
+ if (ret != -ENOBUFS)
+ printk(KERN_WARNING
+ "CacheFiles: Lookup failed error %d\n", ret);
fscache_object_lookup_error(&object->fscache);
}
_leave(" [%d]", ret);
+ return ret;
}
/*
@@ -331,6 +334,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
}
cache = object->fscache.cache;
+ fscache_object_destroy(&object->fscache);
kmem_cache_free(cachefiles_object_jar, object);
fscache_object_destroyed(cache);
}
@@ -403,12 +407,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
if (oi_size == ni_size)
return 0;
- newattrs.ia_size = ni_size;
- newattrs.ia_valid = ATTR_SIZE;
-
cachefiles_begin_secure(cache, &saved_cred);
mutex_lock(&object->backer->d_inode->i_mutex);
+
+ /* if there's an extension to a partial page at the end of the backing
+ * file, we need to discard the partial page so that we pick up new
+ * data after it */
+ if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+ _debug("discard tail %llx", oi_size);
+ newattrs.ia_valid = ATTR_SIZE;
+ newattrs.ia_size = oi_size & PAGE_MASK;
+ ret = notify_change(object->backer, &newattrs);
+ if (ret < 0)
+ goto truncate_failed;
+ }
+
+ newattrs.ia_valid = ATTR_SIZE;
+ newattrs.ia_size = ni_size;
ret = notify_change(object->backer, &newattrs);
+
+truncate_failed:
mutex_unlock(&object->backer->d_inode->i_mutex);
cachefiles_end_secure(cache, saved_cred);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 4ce818ae39e..14ac4806e29 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -21,17 +21,81 @@
#include <linux/security.h>
#include "internal.h"
-static int cachefiles_wait_bit(void *flags)
+#define CACHEFILES_KEYBUF_SIZE 512
+
+/*
+ * dump debugging info about an object
+ */
+static noinline
+void __cachefiles_printk_object(struct cachefiles_object *object,
+ const char *prefix,
+ u8 *keybuf)
{
- schedule();
- return 0;
+ struct fscache_cookie *cookie;
+ unsigned keylen, loop;
+
+ printk(KERN_ERR "%sobject: OBJ%x\n",
+ prefix, object->fscache.debug_id);
+ printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+ prefix, fscache_object_states[object->fscache.state],
+ object->fscache.flags, object->fscache.work.flags,
+ object->fscache.events,
+ object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+ printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
+ prefix, object->fscache.n_ops, object->fscache.n_in_progress,
+ object->fscache.n_exclusive);
+ printk(KERN_ERR "%sparent=%p\n",
+ prefix, object->fscache.parent);
+
+ spin_lock(&object->fscache.lock);
+ cookie = object->fscache.cookie;
+ if (cookie) {
+ printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+ prefix,
+ object->fscache.cookie,
+ object->fscache.cookie->parent,
+ object->fscache.cookie->netfs_data,
+ object->fscache.cookie->flags);
+ if (keybuf)
+ keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
+ CACHEFILES_KEYBUF_SIZE);
+ else
+ keylen = 0;
+ } else {
+ printk(KERN_ERR "%scookie=NULL\n", prefix);
+ keylen = 0;
+ }
+ spin_unlock(&object->fscache.lock);
+
+ if (keylen) {
+ printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
+ for (loop = 0; loop < keylen; loop++)
+ printk("%02x", keybuf[loop]);
+ printk("'\n");
+ }
+}
+
+/*
+ * dump debugging info about a pair of objects
+ */
+static noinline void cachefiles_printk_object(struct cachefiles_object *object,
+ struct cachefiles_object *xobject)
+{
+ u8 *keybuf;
+
+ keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
+ if (object)
+ __cachefiles_printk_object(object, "", keybuf);
+ if (xobject)
+ __cachefiles_printk_object(xobject, "x", keybuf);
+ kfree(keybuf);
}
/*
* record the fact that an object is now active
*/
-static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
- struct cachefiles_object *object)
+static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
{
struct cachefiles_object *xobject;
struct rb_node **_p, *_parent = NULL;
@@ -42,8 +106,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
try_again:
write_lock(&cache->active_lock);
- if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+ if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+ printk(KERN_ERR "CacheFiles: Error: Object already active\n");
+ cachefiles_printk_object(object, NULL);
BUG();
+ }
dentry = object->dentry;
_p = &cache->active_nodes.rb_node;
@@ -66,8 +133,8 @@ try_again:
rb_insert_color(&object->active_node, &cache->active_nodes);
write_unlock(&cache->active_lock);
- _leave("");
- return;
+ _leave(" = 0");
+ return 0;
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
@@ -76,44 +143,70 @@ wait_for_old_object:
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Unexpected object collision\n");
- printk(KERN_ERR "xobject: OBJ%x\n",
- xobject->fscache.debug_id);
- printk(KERN_ERR "xobjstate=%s\n",
- fscache_object_states[xobject->fscache.state]);
- printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
- printk(KERN_ERR "xobjevent=%lx [%lx]\n",
- xobject->fscache.events, xobject->fscache.event_mask);
- printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
- xobject->fscache.n_ops, xobject->fscache.n_in_progress,
- xobject->fscache.n_exclusive);
- printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
- xobject->fscache.cookie,
- xobject->fscache.cookie->parent,
- xobject->fscache.cookie->netfs_data,
- xobject->fscache.cookie->flags);
- printk(KERN_ERR "xparent=%p\n",
- xobject->fscache.parent);
- printk(KERN_ERR "object: OBJ%x\n",
- object->fscache.debug_id);
- printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
- object->fscache.cookie,
- object->fscache.cookie->parent,
- object->fscache.cookie->netfs_data,
- object->fscache.cookie->flags);
- printk(KERN_ERR "parent=%p\n",
- object->fscache.parent);
+ cachefiles_printk_object(object, xobject);
BUG();
}
atomic_inc(&xobject->usage);
write_unlock(&cache->active_lock);
- _debug(">>> wait");
- wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
- cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
- _debug("<<< waited");
+ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+ wait_queue_head_t *wq;
+
+ signed long timeout = 60 * HZ;
+ wait_queue_t wait;
+ bool requeue;
+
+ /* if the object we're waiting for is queued for processing,
+ * then just put ourselves on the queue behind it */
+ if (slow_work_is_queued(&xobject->fscache.work)) {
+ _debug("queue OBJ%x behind OBJ%x immediately",
+ object->fscache.debug_id,
+ xobject->fscache.debug_id);
+ goto requeue;
+ }
+
+ /* otherwise we sleep until either the object we're waiting for
+ * is done, or the slow-work facility wants the thread back to
+ * do other work */
+ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
+ init_wait(&wait);
+ requeue = false;
+ do {
+ prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
+ break;
+ requeue = slow_work_sleep_till_thread_needed(
+ &object->fscache.work, &timeout);
+ } while (timeout > 0 && !requeue);
+ finish_wait(wq, &wait);
+
+ if (requeue &&
+ test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+ _debug("queue OBJ%x behind OBJ%x after wait",
+ object->fscache.debug_id,
+ xobject->fscache.debug_id);
+ goto requeue;
+ }
+
+ if (timeout <= 0) {
+ printk(KERN_ERR "\n");
+ printk(KERN_ERR "CacheFiles: Error: Overlong"
+ " wait for old active object to go away\n");
+ cachefiles_printk_object(object, xobject);
+ goto requeue;
+ }
+ }
+
+ ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
cache->cache.ops->put_object(&xobject->fscache);
goto try_again;
+
+requeue:
+ clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+ cache->cache.ops->put_object(&xobject->fscache);
+ _leave(" = -ETIMEDOUT");
+ return -ETIMEDOUT;
}
/*
@@ -254,7 +347,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
dir = dget_parent(object->dentry);
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
ret = cachefiles_bury_object(cache, dir, object->dentry);
dput(dir);
@@ -307,7 +400,7 @@ lookup_again:
/* search the current directory for the element name */
_debug("lookup '%s'", name);
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
next = lookup_one_len(name, dir, nlen);
@@ -418,12 +511,15 @@ lookup_again:
}
/* note that we're now using this object */
- cachefiles_mark_object_active(cache, object);
+ ret = cachefiles_mark_object_active(cache, object);
mutex_unlock(&dir->d_inode->i_mutex);
dput(dir);
dir = NULL;
+ if (ret == -ETIMEDOUT)
+ goto mark_active_timed_out;
+
_debug("=== OBTAINED_OBJECT ===");
if (object->new) {
@@ -467,6 +563,10 @@ create_error:
cachefiles_io_error(cache, "Create/mkdir failed");
goto error;
+mark_active_timed_out:
+ _debug("mark active timed out");
+ goto release_dentry;
+
check_error:
_debug("check error %d", ret);
write_lock(&cache->active_lock);
@@ -474,7 +574,7 @@ check_error:
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
write_unlock(&cache->active_lock);
-
+release_dentry:
dput(object->dentry);
object->dentry = NULL;
goto error_out;
@@ -495,9 +595,6 @@ error:
error_out2:
dput(dir);
error_out:
- if (ret == -ENOSPC)
- ret = -ENOBUFS;
-
_leave(" = error %d", -ret);
return ret;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a69787e7dd9..a6c8c6fe8df 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -11,6 +11,7 @@
#include <linux/mount.h>
#include <linux/file.h>
+#include <linux/ima.h>
#include "internal.h"
/*
@@ -40,8 +41,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
_debug("--- monitor %p %lx ---", page, page->flags);
- if (!PageUptodate(page) && !PageError(page))
- dump_stack();
+ if (!PageUptodate(page) && !PageError(page)) {
+ /* unlocked, not uptodate and not erronous? */
+ _debug("page probably truncated");
+ }
/* remove from the waitqueue */
list_del(&wait->task_list);
@@ -61,6 +64,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
}
/*
+ * handle a probably truncated page
+ * - check to see if the page is still relevant and reissue the read if
+ * possible
+ * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
+ * must wait again and 0 if successful
+ */
+static int cachefiles_read_reissue(struct cachefiles_object *object,
+ struct cachefiles_one_read *monitor)
+{
+ struct address_space *bmapping = object->backer->d_inode->i_mapping;
+ struct page *backpage = monitor->back_page, *backpage2;
+ int ret;
+
+ kenter("{ino=%lx},{%lx,%lx}",
+ object->backer->d_inode->i_ino,
+ backpage->index, backpage->flags);
+
+ /* skip if the page was truncated away completely */
+ if (backpage->mapping != bmapping) {
+ kleave(" = -ENODATA [mapping]");
+ return -ENODATA;
+ }
+
+ backpage2 = find_get_page(bmapping, backpage->index);
+ if (!backpage2) {
+ kleave(" = -ENODATA [gone]");
+ return -ENODATA;
+ }
+
+ if (backpage != backpage2) {
+ put_page(backpage2);
+ kleave(" = -ENODATA [different]");
+ return -ENODATA;
+ }
+
+ /* the page is still there and we already have a ref on it, so we don't
+ * need a second */
+ put_page(backpage2);
+
+ INIT_LIST_HEAD(&monitor->op_link);
+ add_page_wait_queue(backpage, &monitor->monitor);
+
+ if (trylock_page(backpage)) {
+ ret = -EIO;
+ if (PageError(backpage))
+ goto unlock_discard;
+ ret = 0;
+ if (PageUptodate(backpage))
+ goto unlock_discard;
+
+ kdebug("reissue read");
+ ret = bmapping->a_ops->readpage(NULL, backpage);
+ if (ret < 0)
+ goto unlock_discard;
+ }
+
+ /* but the page may have been read before the monitor was installed, so
+ * the monitor may miss the event - so we have to ensure that we do get
+ * one in such a case */
+ if (trylock_page(backpage)) {
+ _debug("jumpstart %p {%lx}", backpage, backpage->flags);
+ unlock_page(backpage);
+ }
+
+ /* it'll reappear on the todo list */
+ kleave(" = -EINPROGRESS");
+ return -EINPROGRESS;
+
+unlock_discard:
+ unlock_page(backpage);
+ spin_lock_irq(&object->work_lock);
+ list_del(&monitor->op_link);
+ spin_unlock_irq(&object->work_lock);
+ kleave(" = %d", ret);
+ return ret;
+}
+
+/*
* copy data from backing pages to netfs pages to complete a read operation
* - driven by FS-Cache's thread pool
*/
@@ -92,20 +173,26 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_debug("- copy {%lu}", monitor->back_page->index);
- error = -EIO;
+ recheck:
if (PageUptodate(monitor->back_page)) {
copy_highpage(monitor->netfs_page, monitor->back_page);
pagevec_add(&pagevec, monitor->netfs_page);
fscache_mark_pages_cached(monitor->op, &pagevec);
error = 0;
- }
-
- if (error)
+ } else if (!PageError(monitor->back_page)) {
+ /* the page has probably been truncated */
+ error = cachefiles_read_reissue(object, monitor);
+ if (error == -EINPROGRESS)
+ goto next;
+ goto recheck;
+ } else {
cachefiles_io_error_obj(
object,
"Readpage failed on backing file %lx",
(unsigned long) monitor->back_page->flags);
+ error = -EIO;
+ }
page_cache_release(monitor->back_page);
@@ -114,6 +201,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
fscache_put_retrieval(op);
kfree(monitor);
+ next:
/* let the thread pool have some air occasionally */
max--;
if (max < 0 || need_resched()) {
@@ -333,7 +421,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
- op->op.flags = FSCACHE_OP_FAST;
+ op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+ op->op.flags |= FSCACHE_OP_FAST;
op->op.processor = cachefiles_read_copier;
pagevec_init(&pagevec, 0);
@@ -639,7 +728,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
pagevec_init(&pagevec, 0);
- op->op.flags = FSCACHE_OP_FAST;
+ op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+ op->op.flags |= FSCACHE_OP_FAST;
op->op.processor = cachefiles_read_copier;
INIT_LIST_HEAD(&backpages);
@@ -801,7 +891,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
struct cachefiles_cache *cache;
mm_segment_t old_fs;
struct file *file;
- loff_t pos;
+ loff_t pos, eof;
+ size_t len;
void *data;
int ret;
@@ -832,18 +923,33 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
if (IS_ERR(file)) {
ret = PTR_ERR(file);
} else {
+ ima_counts_get(file);
ret = -EIO;
if (file->f_op->write) {
pos = (loff_t) page->index << PAGE_SHIFT;
+
+ /* we mustn't write more data than we have, so we have
+ * to beware of a partial page at EOF */
+ eof = object->fscache.store_limit_l;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ ASSERTCMP(pos, <, eof);
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
+ }
+ }
+
data = kmap(page);
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = file->f_op->write(
- file, (const void __user *) data, PAGE_SIZE,
- &pos);
+ file, (const void __user *) data, len, &pos);
set_fs(old_fs);
kunmap(page);
- if (ret != PAGE_SIZE)
+ if (ret != len)
ret = -EIO;
}
fput(file);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 145540a316a..094ea65afc8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries works).
+
Version 1.60
-------------
Fix memory leak in reconnect. Fix oops in DFS mount error path.
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9a5e4f5f312..29f1da761bb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1037,7 +1037,7 @@ init_cifs(void)
if (rc)
goto out_unregister_key_type;
#endif
- rc = slow_work_register_user();
+ rc = slow_work_register_user(THIS_MODULE);
if (rc)
goto out_unregister_resolver_key;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6928c24d1d4..5646727e33f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -388,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
const struct nls_table *nls_codepage, int remap_special_chars);
extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 43003e0bef1..63ea83ff687 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
out_err:
if (tcp_ses) {
- kfree(tcp_ses->hostname);
+ if (!IS_ERR(tcp_ses->hostname))
+ kfree(tcp_ses->hostname);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -2219,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
{
int rc;
- __u64 inode_num;
FILE_ALL_INFO *pfile_info;
- rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != -EOPNOTSUPP)
- return rc;
-
pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
if (pfile_info == NULL)
return -ENOMEM;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 627a60a6c1b..1f42f772865 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,8 +214,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
posix_flags |= SMB_O_EXCL;
if (oflags & O_TRUNC)
posix_flags |= SMB_O_TRUNC;
- if (oflags & O_APPEND)
- posix_flags |= SMB_O_APPEND;
if (oflags & O_SYNC)
posix_flags |= SMB_O_SYNC;
if (oflags & O_DIRECTORY)
@@ -643,9 +641,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* O_EXCL: optimize away the lookup, but don't hash the dentry. Let
* the VFS handle the create.
*/
- if (nd->flags & LOOKUP_EXCL) {
+ if (nd && (nd->flags & LOOKUP_EXCL)) {
d_instantiate(direntry, NULL);
- return 0;
+ return NULL;
}
/* can not grab the rename sem here since it would
@@ -675,7 +673,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
* reduction in network traffic in the other paths.
*/
if (pTcon->unix_ext) {
- if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+ if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
(nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
(nd->intent.open.flags & O_CREAT)) {
rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e2492535da..cababd8a52d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc1) {
+ if (rc1 || !fattr.cf_uniqueid) {
cFYI(1, ("GetSrvInodeNum rc %d", rc1));
fattr.cf_uniqueid = iunique(sb, ROOT_I);
- /* disable serverino if call not supported */
- if (rc1 == -EINVAL)
- cifs_sb->mnt_cifs_flags &=
- ~CIFS_MOUNT_SERVER_INUM;
+ cifs_autodisable_serverino(cifs_sb);
}
} else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0241b25ac33..d27d4ec6579 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -715,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
ctoUCS_out:
return i;
}
+
+void
+cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+ cERROR(1, ("Autodisabling the use of server inode numbers on "
+ "%s. This server doesn't seem to support them "
+ "properly. Hardlinks will not be recognized on this "
+ "mount. Consider mounting with the \"noserverino\" "
+ "option to silence this message.",
+ cifs_sb->tcon->treeName));
+ }
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1f098ca7163..f84062f9a98 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
pfindEntry, cifs_sb);
- /* FIXME: make _to_fattr functions fill this out */
- if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+ if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
fattr.cf_uniqueid = inum;
- else
+ } else {
fattr.cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ }
ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
diff --git a/fs/compat.c b/fs/compat.c
index d576b552e8e..6c19040ffee 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename,
if (retval < 0)
goto out;
+ current->stack_start = current->mm->start_stack;
+
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f91fd51b32e..d84e7058c29 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1800,7 +1800,7 @@ struct space_resv_32 {
/* just account for different alignment */
static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
{
- struct space_resv_32 __user *p32 = (void __user *)arg;
+ struct space_resv_32 __user *p32 = compat_ptr(arg);
struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
#else
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- error = ioctl_preallocate(filp, (void __user *)arg);
+ error = ioctl_preallocate(filp, compat_ptr(arg));
goto out_fput;
#endif
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 240cef14fe5..70736eb4b51 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
+ /* with sctp there's no connecting without sending */
+ if (dlm_config.ci_protocol != 0)
+ return 0;
+
if (nodeid == dlm_our_nodeid())
return 0;
@@ -455,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
int prim_len, ret;
int addr_len;
struct connection *new_con;
- struct file *file;
sctp_peeloff_arg_t parg;
int parglen = sizeof(parg);
+ int err;
/*
* We get this before any data for an association.
@@ -512,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
SCTP_SOCKOPT_PEELOFF,
(void *)&parg, &parglen);
- if (ret) {
+ if (ret < 0) {
log_print("Can't peel off a socket for "
- "connection %d to node %d: err=%d\n",
+ "connection %d to node %d: err=%d",
parg.associd, nodeid, ret);
+ return;
+ }
+ new_con->sock = sockfd_lookup(parg.sd, &err);
+ if (!new_con->sock) {
+ log_print("sockfd_lookup error %d", err);
+ return;
}
- file = fget(parg.sd);
- new_con->sock = SOCKET_I(file->f_dentry->d_inode);
add_sock(new_con->sock, new_con);
- fput(file);
- put_unused_fd(parg.sd);
+ sockfd_put(new_con->sock);
- log_print("got new/restarted association %d nodeid %d",
- (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+ log_print("connecting to %d sctp association %d",
+ nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -837,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
if (con->retries++ > MAX_CONNECT_RETRIES)
return;
- log_print("Initiating association with node %d", con->nodeid);
-
if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
log_print("no address for nodeid %d", con->nodeid);
return;
@@ -855,11 +860,14 @@ static void sctp_init_assoc(struct connection *con)
outmessage.msg_flags = MSG_EOR;
spin_lock(&con->writequeue_lock);
- e = list_entry(con->writequeue.next, struct writequeue_entry,
- list);
- BUG_ON((struct list_head *) e == &con->writequeue);
+ if (list_empty(&con->writequeue)) {
+ spin_unlock(&con->writequeue_lock);
+ log_print("writequeue empty for nodeid %d", con->nodeid);
+ return;
+ }
+ e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
spin_unlock(&con->writequeue_lock);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 8aadb99b763..1cd6d9d3e29 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,8 +1,9 @@
config ECRYPT_FS
tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && KEYS && NET
+ depends on EXPERIMENTAL && KEYS && CRYPTO
select CRYPTO_ECB
select CRYPTO_CBC
+ select CRYPTO_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 101fe4c7b1e..c6ac85d6c70 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
#include <linux/key.h>
#include <linux/parser.h>
#include <linux/fs_stack.h>
+#include <linux/ima.h>
#include "ecryptfs_kernel.h"
/**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
const struct cred *cred = current_cred();
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+ int opened_lower_file = 0;
int rc = 0;
mutex_lock(&inode_info->lower_file_mutex);
@@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
"rc = [%d]\n", lower_dentry, lower_mnt, rc);
inode_info->lower_file = NULL;
- }
+ } else
+ opened_lower_file = 1;
}
mutex_unlock(&inode_info->lower_file_mutex);
+ if (opened_lower_file)
+ ima_counts_get(inode_info->lower_file);
return rc;
}
diff --git a/fs/exec.c b/fs/exec.c
index d49be6bc179..c0c636e34f6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,6 @@
#include <linux/proc_fs.h>
#include <linux/mount.h>
#include <linux/security.h>
-#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
@@ -624,10 +623,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
/* Move stack pages down in memory. */
if (stack_shift) {
ret = shift_arg_pages(vma, stack_shift);
- if (ret) {
- up_write(&mm->mmap_sem);
- return ret;
- }
+ if (ret)
+ goto out_unlock;
}
#ifdef CONFIG_STACK_GROWSUP
@@ -641,7 +638,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
out_unlock:
up_write(&mm->mmap_sem);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
@@ -1211,9 +1208,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
retval = security_bprm_check(bprm);
if (retval)
return retval;
- retval = ima_bprm_check(bprm);
- if (retval)
- return retval;
/* kernel module loader fixup */
/* so we don't try to load run modprobe in kernel space. */
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 451d166bbe9..8209f266e9a 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -46,19 +46,21 @@
int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+ journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
int ret = 0;
+ tid_t commit_tid;
+
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
J_ASSERT(ext3_journal_current_handle() == NULL);
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for a proper transaction
+ * to commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
goto out;
}
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto flush;
+ if (datasync)
+ commit_tid = atomic_read(&ei->i_datasync_tid);
+ else
+ commit_tid = atomic_read(&ei->i_sync_tid);
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- ret = sync_inode(inode, &wbc);
+ if (log_start_commit(journal, commit_tid)) {
+ log_wait_commit(journal, commit_tid);
goto out;
}
-flush:
+
/*
* In case we didn't commit a transaction, we have to flush
* disk caches manually so that data really is on persistent
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index acf1b142332..354ed3b47b3 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
int err = 0;
struct ext3_block_alloc_info *block_i;
ext3_fsblk_t current_block;
+ struct ext3_inode_info *ei = EXT3_I(inode);
- block_i = EXT3_I(inode)->i_block_alloc_info;
+ block_i = ei->i_block_alloc_info;
/*
* If we're splicing into a [td]indirect block (as opposed to the
* inode) then we need to get write access to the [td]indirect block
@@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
inode->i_ctime = CURRENT_TIME_SEC;
ext3_mark_inode_dirty(handle, inode);
+ /* ext3_mark_inode_dirty already updated i_sync_tid */
+ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
/* had we spliced it onto indirect block? */
if (where->bh) {
@@ -1735,6 +1738,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1757,9 +1761,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext3_get_block, NULL);
+ if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -2750,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
struct ext3_inode_info *ei;
struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+ transaction_t *transaction;
long ret;
int block;
@@ -2827,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ atomic_set(&ei->i_sync_tid, tid);
+ atomic_set(&ei->i_datasync_tid, tid);
+ }
+
if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
/*
@@ -3011,6 +3044,7 @@ again:
err = rc;
ei->i_state &= ~EXT3_STATE_NEW;
+ atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
out_brelse:
brelse (bh);
ext3_std_error(inode->i_sb, err);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 72743d36050..427496c4767 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
return NULL;
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+ atomic_set(&ei->i_datasync_tid, 0);
+ atomic_set(&ei->i_sync_tid, 0);
return &ei->vfs_inode;
}
@@ -2321,7 +2323,18 @@ static int ext3_commit_super(struct super_block *sb,
if (!sbh)
return error;
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 984ca0cb38c..8825515eedd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -322,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -743,6 +744,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 10539e36428..715264b4bae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2807,6 +2807,8 @@ fix_extent_len:
* into three uninitialized extent(at most). After IO complete, the part
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of uninitialized extent to be written on success.
*/
static int ext4_split_unwritten_extents(handle_t *handle,
struct inode *inode,
@@ -2824,7 +2826,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
- int ret = 0;
ext_debug("ext4_split_unwritten_extents: inode %lu,"
"iblock %llu, max_blocks %u\n", inode->i_ino,
@@ -2842,12 +2843,12 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
/*
- * if the entire unintialized extent length less than
- * the size of extent to write, there is no need to split
- * uninitialized extent
+ * If the uninitialized extent begins at the same logical
+ * block where the write begins, and the write completely
+ * covers the extent, then we don't need to split it.
*/
- if (allocated <= max_blocks)
- return ret;
+ if ((iblock == ee_block) && (allocated <= max_blocks))
+ return allocated;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3048,12 +3049,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ret = ext4_split_unwritten_extents(handle,
inode, path, iblock,
max_blocks, flags);
- /* flag the io_end struct that we need convert when IO done */
+ /*
+ * Flag the inode(non aio case) or end_io struct (aio case)
+ * that this IO needs to convertion to written when IO is
+ * completed
+ */
if (io)
io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
goto out;
}
- /* DIO end_io complete, convert the filled extent to written */
+ /* async DIO end_io complete, convert the filled extent to written */
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
ret = ext4_convert_unwritten_extents_dio(handle, inode,
path);
@@ -3295,10 +3302,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* To avoid unecessary convertion for every aio dio rewrite
* to the mid of file, here we flag the IO that is really
* need the convertion.
- *
+ * For non asycn direct IO case, flag the inode state
+ * that we need to perform convertion when IO is done.
*/
- if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
- io->flag = DIO_AIO_UNWRITTEN;
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |=
+ EXT4_STATE_DIO_UNWRITTEN;;
+ }
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
@@ -3519,6 +3532,7 @@ retry:
*
* This function is called from the direct IO end io call back
* function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
*/
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
loff_t len)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5c5bc5dafff..2c8caa51add 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -193,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
int nblocks)
{
int ret;
@@ -209,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
up_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
return ret;
}
@@ -3445,8 +3446,6 @@ out:
return ret;
}
-/* Maximum number of blocks we map for direct IO at once. */
-
static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -3654,13 +3653,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
ext_debug("ext4_end_io_dio(): io_end 0x%p"
"for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
iocb->private, io_end->inode->i_ino, iocb, offset,
size);
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- return;
/* if not aio dio with unwritten extents, just free io and return */
if (io_end->flag != DIO_AIO_UNWRITTEN){
@@ -3771,13 +3771,19 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0)
+ } else if (ret > 0 && (EXT4_I(inode)->i_state &
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
/*
* for non AIO case, since the IO is already
* completed, we could do the convertion right here
*/
- ret = ext4_convert_unwritten_extents(inode,
- offset, ret);
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ }
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 7c8fe80bacd..6d2c1b897fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
- retval = make_indexed_dir(handle, dentry, inode, bh);
- if (retval == -ENOSPC)
- brelse(bh);
- return retval;
- }
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+ return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
bh = ext4_append(handle, dir, &block, &retval);
@@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
- if (retval == -ENOSPC)
- brelse(bh);
- return retval;
+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
/*
@@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- if (err != -ENOSPC)
- bh = NULL;
+ bh = NULL;
goto cleanup;
journal_error:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 312211ee05a..d4ca92aab51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1300,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb,
*journal_devnum = option;
break;
case Opt_journal_checksum:
- break; /* Kept for backwards compatibility */
+ set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+ break;
case Opt_journal_async_commit:
set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+ set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
set_opt(sbi->s_mount_opt, NOLOAD);
@@ -2759,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
- jbd2_journal_set_features(sbi->s_journal, 0, 0,
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- else
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index fc089f2f7f5..2cf93ec40a6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
type = PIDTYPE_PID;
break;
- case F_OWNER_GID:
+ case F_OWNER_PGRP:
type = PIDTYPE_PGID;
break;
@@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
break;
case PIDTYPE_PGID:
- owner.type = F_OWNER_GID;
+ owner.type = F_OWNER_PGRP;
break;
default:
diff --git a/fs/file.c b/fs/file.c
index f313314f996..87e129030ab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/time.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 8eb44042e00..4bef4c01ec6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
-#include <linux/ima.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
@@ -280,7 +279,6 @@ void __fput(struct file *file)
if (file->f_op && file->f_op->release)
file->f_op->release(inode, file);
security_file_free(file);
- ima_file_free(file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
cdev_put(inode->i_cdev);
fops_put(file->f_op);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 9bbb8ce7bea..864dac20a24 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -54,3 +54,10 @@ config FSCACHE_DEBUG
enabled by setting bits in /sys/modules/fscache/parameter/debug.
See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+ bool "Maintain global object list for debugging purposes"
+ depends on FSCACHE && PROC_FS
+ help
+ Maintain a global list of active fscache objects that can be
+ retrieved through /proc/fs/fscache/objects for debugging purposes
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 91571b95aac..6d561531cb3 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -15,5 +15,6 @@ fscache-y := \
fscache-$(CONFIG_PROC_FS) += proc.o
fscache-$(CONFIG_FSCACHE_STATS) += stats.o
fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index e21985bbb1f..6a3c48abd67 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache,
spin_lock(&cache->object_list_lock);
list_add_tail(&ifsdef->cache_link, &cache->object_list);
spin_unlock(&cache->object_list_lock);
+ fscache_objlist_add(ifsdef);
/* add the cache's netfs definition index object to the top level index
* cookie as a known backing object */
@@ -380,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
/* make sure all pages pinned by operations on behalf of the netfs are
* written to disk */
+ fscache_stat(&fscache_n_cop_sync_cache);
cache->ops->sync_cache(cache);
+ fscache_stat_d(&fscache_n_cop_sync_cache);
/* dissociate all the netfs pages backed by this cache from the block
* mappings in the cache */
+ fscache_stat(&fscache_n_cop_dissociate_pages);
cache->ops->dissociate_pages(cache);
+ fscache_stat_d(&fscache_n_cop_dissociate_pages);
/* we now have to destroy all the active objects pertaining to this
* cache - which we do by passing them off to thread pool to be
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 72fd18f6c71..990535071a8 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie)
memset(cookie, 0, sizeof(*cookie));
spin_lock_init(&cookie->lock);
+ spin_lock_init(&cookie->stores_lock);
INIT_HLIST_HEAD(&cookie->backing_objects);
}
@@ -102,7 +103,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
cookie->netfs_data = netfs_data;
cookie->flags = 0;
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+ /* radix tree insertion won't use the preallocation pool unless it's
+ * told it may not wait */
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
@@ -249,7 +252,9 @@ static int fscache_alloc_object(struct fscache_cache *cache,
/* ask the cache to allocate an object (we may end up with duplicate
* objects at this stage, but we sort that out later) */
+ fscache_stat(&fscache_n_cop_alloc_object);
object = cache->ops->alloc_object(cache, cookie);
+ fscache_stat_d(&fscache_n_cop_alloc_object);
if (IS_ERR(object)) {
fscache_stat(&fscache_n_object_no_alloc);
ret = PTR_ERR(object);
@@ -270,8 +275,11 @@ static int fscache_alloc_object(struct fscache_cache *cache,
/* only attach if we managed to allocate all we needed, otherwise
* discard the object we just allocated and instead use the one
* attached to the cookie */
- if (fscache_attach_object(cookie, object) < 0)
+ if (fscache_attach_object(cookie, object) < 0) {
+ fscache_stat(&fscache_n_cop_put_object);
cache->ops->put_object(object);
+ fscache_stat_d(&fscache_n_cop_put_object);
+ }
_leave(" = 0");
return 0;
@@ -287,7 +295,9 @@ object_already_extant:
return 0;
error_put:
+ fscache_stat(&fscache_n_cop_put_object);
cache->ops->put_object(object);
+ fscache_stat_d(&fscache_n_cop_put_object);
error:
_leave(" = %d", ret);
return ret;
@@ -349,6 +359,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
object->cookie = cookie;
atomic_inc(&cookie->usage);
hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+
+ fscache_objlist_add(object);
ret = 0;
cant_attach_object:
@@ -403,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
unsigned long event;
fscache_stat(&fscache_n_relinquishes);
+ if (retire)
+ fscache_stat(&fscache_n_relinquishes_retire);
if (!cookie) {
fscache_stat(&fscache_n_relinquishes_null);
@@ -428,12 +442,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
- /* detach pointers back to the netfs */
spin_lock(&cookie->lock);
- cookie->netfs_data = NULL;
- cookie->def = NULL;
-
/* break links with all the active objects */
while (!hlist_empty(&cookie->backing_objects)) {
object = hlist_entry(cookie->backing_objects.first,
@@ -456,6 +466,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
BUG();
}
+ /* detach pointers back to the netfs */
+ cookie->netfs_data = NULL;
+ cookie->def = NULL;
+
spin_unlock(&cookie->lock);
if (cookie->parent) {
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1c341304621..edd7434ab6e 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -17,6 +17,7 @@
* - cache->object_list_lock
* - object->lock
* - object->parent->lock
+ * - cookie->stores_lock
* - fscache_thread_lock
*
*/
@@ -88,17 +89,31 @@ extern int fscache_wait_bit_interruptible(void *);
/*
* object.c
*/
+extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
+
extern void fscache_withdrawing_object(struct fscache_cache *,
struct fscache_object *);
extern void fscache_enqueue_object(struct fscache_object *);
/*
+ * object-list.c
+ */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern const struct file_operations fscache_objlist_fops;
+
+extern void fscache_objlist_add(struct fscache_object *);
+#else
+#define fscache_objlist_add(object) do {} while(0)
+#endif
+
+/*
* operation.c
*/
extern int fscache_submit_exclusive_op(struct fscache_object *,
struct fscache_operation *);
extern int fscache_submit_op(struct fscache_object *,
struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *);
extern void fscache_abort_object(struct fscache_object *);
extern void fscache_start_operations(struct fscache_object *);
extern void fscache_operation_gc(struct work_struct *);
@@ -127,6 +142,8 @@ extern atomic_t fscache_n_op_enqueue;
extern atomic_t fscache_n_op_deferred_release;
extern atomic_t fscache_n_op_release;
extern atomic_t fscache_n_op_gc;
+extern atomic_t fscache_n_op_cancelled;
+extern atomic_t fscache_n_op_rejected;
extern atomic_t fscache_n_attr_changed;
extern atomic_t fscache_n_attr_changed_ok;
@@ -138,6 +155,8 @@ extern atomic_t fscache_n_allocs;
extern atomic_t fscache_n_allocs_ok;
extern atomic_t fscache_n_allocs_wait;
extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_allocs_intr;
+extern atomic_t fscache_n_allocs_object_dead;
extern atomic_t fscache_n_alloc_ops;
extern atomic_t fscache_n_alloc_op_waits;
@@ -148,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata;
extern atomic_t fscache_n_retrievals_nobufs;
extern atomic_t fscache_n_retrievals_intr;
extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrievals_object_dead;
extern atomic_t fscache_n_retrieval_ops;
extern atomic_t fscache_n_retrieval_op_waits;
@@ -158,6 +178,14 @@ extern atomic_t fscache_n_stores_nobufs;
extern atomic_t fscache_n_stores_oom;
extern atomic_t fscache_n_store_ops;
extern atomic_t fscache_n_store_calls;
+extern atomic_t fscache_n_store_pages;
+extern atomic_t fscache_n_store_radix_deletes;
+extern atomic_t fscache_n_store_pages_over_limit;
+
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
extern atomic_t fscache_n_marks;
extern atomic_t fscache_n_uncaches;
@@ -176,6 +204,7 @@ extern atomic_t fscache_n_updates_run;
extern atomic_t fscache_n_relinquishes;
extern atomic_t fscache_n_relinquishes_null;
extern atomic_t fscache_n_relinquishes_waitcrt;
+extern atomic_t fscache_n_relinquishes_retire;
extern atomic_t fscache_n_cookie_index;
extern atomic_t fscache_n_cookie_data;
@@ -186,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc;
extern atomic_t fscache_n_object_lookups;
extern atomic_t fscache_n_object_lookups_negative;
extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_lookups_timed_out;
extern atomic_t fscache_n_object_created;
extern atomic_t fscache_n_object_avail;
extern atomic_t fscache_n_object_dead;
@@ -195,15 +225,41 @@ extern atomic_t fscache_n_checkaux_okay;
extern atomic_t fscache_n_checkaux_update;
extern atomic_t fscache_n_checkaux_obsolete;
+extern atomic_t fscache_n_cop_alloc_object;
+extern atomic_t fscache_n_cop_lookup_object;
+extern atomic_t fscache_n_cop_lookup_complete;
+extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_update_object;
+extern atomic_t fscache_n_cop_drop_object;
+extern atomic_t fscache_n_cop_put_object;
+extern atomic_t fscache_n_cop_sync_cache;
+extern atomic_t fscache_n_cop_attr_changed;
+extern atomic_t fscache_n_cop_read_or_alloc_page;
+extern atomic_t fscache_n_cop_read_or_alloc_pages;
+extern atomic_t fscache_n_cop_allocate_page;
+extern atomic_t fscache_n_cop_allocate_pages;
+extern atomic_t fscache_n_cop_write_page;
+extern atomic_t fscache_n_cop_uncache_page;
+extern atomic_t fscache_n_cop_dissociate_pages;
+
static inline void fscache_stat(atomic_t *stat)
{
atomic_inc(stat);
}
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
extern const struct file_operations fscache_stats_fops;
#else
+#define __fscache_stat(stat) (NULL)
#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
#endif
/*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b59749..add6bdb53f0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
{
int ret;
- ret = slow_work_register_user();
+ ret = slow_work_register_user(THIS_MODULE);
if (ret < 0)
goto error_slow_work;
@@ -80,7 +80,7 @@ error_kobj:
error_cookie_jar:
fscache_proc_cleanup();
error_proc:
- slow_work_unregister_user();
+ slow_work_unregister_user(THIS_MODULE);
error_slow_work:
return ret;
}
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
kobject_put(fscache_root);
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
- slow_work_unregister_user();
+ slow_work_unregister_user(THIS_MODULE);
printk(KERN_NOTICE "FS-Cache: Unloaded\n");
}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
new file mode 100644
index 00000000000..e590242fa41
--- /dev/null
+++ b/fs/fscache/object-list.c
@@ -0,0 +1,432 @@
+/* Global fscache object list maintainer and viewer
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+static struct rb_root fscache_object_list;
+static DEFINE_RWLOCK(fscache_object_list_lock);
+
+struct fscache_objlist_data {
+ unsigned long config; /* display configuration */
+#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */
+#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */
+#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */
+#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */
+#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */
+#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */
+#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */
+#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */
+#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */
+#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */
+#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */
+#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */
+#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */
+
+ u8 buf[512]; /* key and aux data buffer */
+};
+
+/*
+ * Add an object to the object list
+ * - we use the address of the fscache_object structure as the key into the
+ * tree
+ */
+void fscache_objlist_add(struct fscache_object *obj)
+{
+ struct fscache_object *xobj;
+ struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
+
+ write_lock(&fscache_object_list_lock);
+
+ while (*p) {
+ parent = *p;
+ xobj = rb_entry(parent, struct fscache_object, objlist_link);
+
+ if (obj < xobj)
+ p = &(*p)->rb_left;
+ else if (obj > xobj)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&obj->objlist_link, parent, p);
+ rb_insert_color(&obj->objlist_link, &fscache_object_list);
+
+ write_unlock(&fscache_object_list_lock);
+}
+
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *obj)
+{
+ write_lock(&fscache_object_list_lock);
+
+ BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
+ rb_erase(&obj->objlist_link, &fscache_object_list);
+
+ write_unlock(&fscache_object_list_lock);
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
+/*
+ * find the object in the tree on or after the specified index
+ */
+static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
+{
+ struct fscache_object *pobj, *obj, *minobj = NULL;
+ struct rb_node *p;
+ unsigned long pos;
+
+ if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
+ return NULL;
+ pos = *_pos;
+
+ /* banners (can't represent line 0 by pos 0 as that would involve
+ * returning a NULL pointer) */
+ if (pos == 0)
+ return (struct fscache_object *) ++(*_pos);
+ if (pos < 3)
+ return (struct fscache_object *)pos;
+
+ pobj = (struct fscache_object *)pos;
+ p = fscache_object_list.rb_node;
+ while (p) {
+ obj = rb_entry(p, struct fscache_object, objlist_link);
+ if (pobj < obj) {
+ if (!minobj || minobj > obj)
+ minobj = obj;
+ p = p->rb_left;
+ } else if (pobj > obj) {
+ p = p->rb_right;
+ } else {
+ minobj = obj;
+ break;
+ }
+ obj = NULL;
+ }
+
+ if (!minobj)
+ *_pos = (unsigned long) ERR_PTR(-ENOENT);
+ else if (minobj != obj)
+ *_pos = (unsigned long) minobj;
+ return minobj;
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&fscache_object_list_lock)
+{
+ read_lock(&fscache_object_list_lock);
+ return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ (*_pos)++;
+ return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_objlist_stop(struct seq_file *m, void *v)
+ __releases(&fscache_object_list_lock)
+{
+ read_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * display an object
+ */
+static int fscache_objlist_show(struct seq_file *m, void *v)
+{
+ struct fscache_objlist_data *data = m->private;
+ struct fscache_object *obj = v;
+ unsigned long config = data->config;
+ uint16_t keylen, auxlen;
+ char _type[3], *type;
+ bool no_cookie;
+ u8 *buf = data->buf, *p;
+
+ if ((unsigned long) v == 1) {
+ seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
+ " EM EV F S"
+ " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
+ if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+ FSCACHE_OBJLIST_CONFIG_AUX))
+ seq_puts(m, " ");
+ if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+ seq_puts(m, "OBJECT_KEY");
+ if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
+ FSCACHE_OBJLIST_CONFIG_AUX)) ==
+ (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
+ seq_puts(m, ", ");
+ if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+ seq_puts(m, "AUX_DATA");
+ seq_puts(m, "\n");
+ return 0;
+ }
+
+ if ((unsigned long) v == 2) {
+ seq_puts(m, "======== ======== ==== ===== === === === == ====="
+ " == == = ="
+ " | ================ == == ================");
+ if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+ FSCACHE_OBJLIST_CONFIG_AUX))
+ seq_puts(m, " ================");
+ seq_puts(m, "\n");
+ return 0;
+ }
+
+ /* filter out any unwanted objects */
+#define FILTER(criterion, _yes, _no) \
+ do { \
+ unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \
+ unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \
+ if (criterion) { \
+ if (!(config & yes)) \
+ return 0; \
+ } else { \
+ if (!(config & no)) \
+ return 0; \
+ } \
+ } while(0)
+
+ if (~config) {
+ FILTER(obj->cookie,
+ COOKIE, NOCOOKIE);
+ FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+ obj->n_ops != 0 ||
+ obj->n_obj_ops != 0 ||
+ obj->flags ||
+ !list_empty(&obj->dependents),
+ BUSY, IDLE);
+ FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
+ PENDWR, NOPENDWR);
+ FILTER(atomic_read(&obj->n_reads),
+ READS, NOREADS);
+ FILTER(obj->events & obj->event_mask,
+ EVENTS, NOEVENTS);
+ FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
+ WORK, NOWORK);
+ }
+
+ seq_printf(m,
+ "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+ obj->debug_id,
+ obj->parent ? obj->parent->debug_id : -1,
+ fscache_object_states_short[obj->state],
+ obj->n_children,
+ obj->n_ops,
+ obj->n_obj_ops,
+ obj->n_in_progress,
+ obj->n_exclusive,
+ atomic_read(&obj->n_reads),
+ obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+ obj->events,
+ obj->flags,
+ obj->work.flags);
+
+ no_cookie = true;
+ keylen = auxlen = 0;
+ if (obj->cookie) {
+ spin_lock(&obj->lock);
+ if (obj->cookie) {
+ switch (obj->cookie->def->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ sprintf(_type, "%02u",
+ obj->cookie->def->type);
+ type = _type;
+ break;
+ }
+
+ seq_printf(m, "%-16s %s %2lx %16p",
+ obj->cookie->def->name,
+ type,
+ obj->cookie->flags,
+ obj->cookie->netfs_data);
+
+ if (obj->cookie->def->get_key &&
+ config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = obj->cookie->def->get_key(
+ obj->cookie->netfs_data,
+ buf, 400);
+
+ if (obj->cookie->def->get_aux &&
+ config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = obj->cookie->def->get_aux(
+ obj->cookie->netfs_data,
+ buf + keylen, 512 - keylen);
+
+ no_cookie = false;
+ }
+ spin_unlock(&obj->lock);
+
+ if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+ seq_printf(m, " ");
+ for (p = buf; keylen > 0; keylen--)
+ seq_printf(m, "%02x", *p++);
+ if (auxlen > 0) {
+ if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+ seq_printf(m, ", ");
+ for (; auxlen > 0; auxlen--)
+ seq_printf(m, "%02x", *p++);
+ }
+ }
+ }
+
+ if (no_cookie)
+ seq_printf(m, "<no_cookie>\n");
+ else
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct seq_operations fscache_objlist_ops = {
+ .start = fscache_objlist_start,
+ .stop = fscache_objlist_stop,
+ .next = fscache_objlist_next,
+ .show = fscache_objlist_show,
+};
+
+/*
+ * get the configuration for filtering the list
+ */
+static void fscache_objlist_config(struct fscache_objlist_data *data)
+{
+#ifdef CONFIG_KEYS
+ struct user_key_payload *confkey;
+ unsigned long config;
+ struct key *key;
+ const char *buf;
+ int len;
+
+ key = request_key(&key_type_user, "fscache:objlist", NULL);
+ if (IS_ERR(key))
+ goto no_config;
+
+ config = 0;
+ rcu_read_lock();
+
+ confkey = key->payload.data;
+ buf = confkey->data;
+
+ for (len = confkey->datalen - 1; len >= 0; len--) {
+ switch (buf[len]) {
+ case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break;
+ case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break;
+ case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break;
+ case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break;
+ case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break;
+ case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break;
+ case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break;
+ case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break;
+ case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break;
+ case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break;
+ case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break;
+ case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break;
+ }
+ }
+
+ rcu_read_unlock();
+ key_put(key);
+
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
+ config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
+ config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
+ config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
+ config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
+ config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
+ if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
+ config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
+
+ data->config = config;
+ return;
+
+no_config:
+#endif
+ data->config = ULONG_MAX;
+}
+
+/*
+ * open "/proc/fs/fscache/objects" to provide a list of active objects
+ * - can be configured by a user-defined key added to the caller's keyrings
+ */
+static int fscache_objlist_open(struct inode *inode, struct file *file)
+{
+ struct fscache_objlist_data *data;
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &fscache_objlist_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+
+ /* buffer for key extraction */
+ data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
+ if (!data) {
+ seq_release(inode, file);
+ return -ENOMEM;
+ }
+
+ /* get the configuration key */
+ fscache_objlist_config(data);
+
+ m->private = data;
+ return 0;
+}
+
+/*
+ * clean up on close
+ */
+static int fscache_objlist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ kfree(m->private);
+ m->private = NULL;
+ return seq_release(inode, file);
+}
+
+const struct file_operations fscache_objlist_fops = {
+ .owner = THIS_MODULE,
+ .open = fscache_objlist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = fscache_objlist_release,
+};
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79..e513ac599c8 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,9 +14,10 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
+#include <linux/seq_file.h>
#include "internal.h"
-const char *fscache_object_states[] = {
+const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
[FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
[FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
[FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
@@ -33,9 +34,28 @@ const char *fscache_object_states[] = {
};
EXPORT_SYMBOL(fscache_object_states);
+const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
+ [FSCACHE_OBJECT_INIT] = "INIT",
+ [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
+ [FSCACHE_OBJECT_CREATING] = "CRTN",
+ [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
+ [FSCACHE_OBJECT_ACTIVE] = "ACTV",
+ [FSCACHE_OBJECT_UPDATING] = "UPDT",
+ [FSCACHE_OBJECT_DYING] = "DYNG",
+ [FSCACHE_OBJECT_LC_DYING] = "LCDY",
+ [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
+ [FSCACHE_OBJECT_RELEASING] = "RELS",
+ [FSCACHE_OBJECT_RECYCLING] = "RCYC",
+ [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
+ [FSCACHE_OBJECT_DEAD] = "DEAD",
+};
+
static void fscache_object_slow_work_put_ref(struct slow_work *);
static int fscache_object_slow_work_get_ref(struct slow_work *);
static void fscache_object_slow_work_execute(struct slow_work *);
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
+#endif
static void fscache_initialise_object(struct fscache_object *);
static void fscache_lookup_object(struct fscache_object *);
static void fscache_object_available(struct fscache_object *);
@@ -45,9 +65,13 @@ static void fscache_enqueue_dependents(struct fscache_object *);
static void fscache_dequeue_object(struct fscache_object *);
const struct slow_work_ops fscache_object_slow_work_ops = {
+ .owner = THIS_MODULE,
.get_ref = fscache_object_slow_work_get_ref,
.put_ref = fscache_object_slow_work_put_ref,
.execute = fscache_object_slow_work_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+ .desc = fscache_object_slow_work_desc,
+#endif
};
EXPORT_SYMBOL(fscache_object_slow_work_ops);
@@ -81,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
static void fscache_object_state_machine(struct fscache_object *object)
{
enum fscache_object_state new_state;
+ struct fscache_cookie *cookie;
ASSERT(object != NULL);
@@ -120,20 +145,31 @@ static void fscache_object_state_machine(struct fscache_object *object)
case FSCACHE_OBJECT_UPDATING:
clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
goto active_transit;
/* handle an object dying during lookup or creation */
case FSCACHE_OBJECT_LC_DYING:
object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+ fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
spin_lock(&object->lock);
object->state = FSCACHE_OBJECT_DYING;
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &object->cookie->flags))
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_CREATING);
+ cookie = object->cookie;
+ if (cookie) {
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
+ &cookie->flags))
+ wake_up_bit(&cookie->flags,
+ FSCACHE_COOKIE_LOOKING_UP);
+ if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+ &cookie->flags))
+ wake_up_bit(&cookie->flags,
+ FSCACHE_COOKIE_CREATING);
+ }
spin_unlock(&object->lock);
fscache_done_parent_op(object);
@@ -165,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
}
spin_unlock(&object->lock);
fscache_enqueue_dependents(object);
+ fscache_start_operations(object);
goto terminal_transit;
/* handle an abort during initialisation */
@@ -316,14 +353,29 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
_enter("{OBJ%x}", object->debug_id);
- clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-
start = jiffies;
fscache_object_state_machine(object);
fscache_hist(fscache_objs_histogram, start);
if (object->events & object->event_mask)
fscache_enqueue_object(object);
+ clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+}
+
+/*
+ * describe an object for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *work,
+ struct seq_file *m)
+{
+ struct fscache_object *object =
+ container_of(work, struct fscache_object, work);
+
+ seq_printf(m, "FSC: OBJ%x: %s",
+ object->debug_id,
+ fscache_object_states_short[object->state]);
}
+#endif
/*
* initialise an object
@@ -376,7 +428,9 @@ static void fscache_initialise_object(struct fscache_object *object)
* binding on to us, so we need to make sure we don't
* add ourself to the list multiple times */
if (list_empty(&object->dep_link)) {
+ fscache_stat(&fscache_n_cop_grab_object);
object->cache->ops->grab_object(object);
+ fscache_stat_d(&fscache_n_cop_grab_object);
list_add(&object->dep_link,
&parent->dependents);
@@ -414,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
struct fscache_object *parent;
+ int ret;
_enter("");
@@ -438,11 +493,20 @@ static void fscache_lookup_object(struct fscache_object *object)
object->cache->tag->name);
fscache_stat(&fscache_n_object_lookups);
- object->cache->ops->lookup_object(object);
+ fscache_stat(&fscache_n_cop_lookup_object);
+ ret = object->cache->ops->lookup_object(object);
+ fscache_stat_d(&fscache_n_cop_lookup_object);
if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ if (ret == -ETIMEDOUT) {
+ /* probably stuck behind another object, so move this one to
+ * the back of the queue */
+ fscache_stat(&fscache_n_object_lookups_timed_out);
+ set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ }
+
_leave("");
}
@@ -546,7 +610,8 @@ static void fscache_object_available(struct fscache_object *object)
spin_lock(&object->lock);
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+ if (object->cookie &&
+ test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
fscache_done_parent_op(object);
@@ -562,7 +627,9 @@ static void fscache_object_available(struct fscache_object *object)
}
spin_unlock(&object->lock);
+ fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
fscache_enqueue_dependents(object);
fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
@@ -581,11 +648,16 @@ static void fscache_drop_object(struct fscache_object *object)
_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+ ASSERTCMP(object->cookie, ==, NULL);
+ ASSERT(hlist_unhashed(&object->cookie_link));
+
spin_lock(&cache->object_list_lock);
list_del_init(&object->cache_link);
spin_unlock(&cache->object_list_lock);
+ fscache_stat(&fscache_n_cop_drop_object);
cache->ops->drop_object(object);
+ fscache_stat_d(&fscache_n_cop_drop_object);
if (parent) {
_debug("release parent OBJ%x {%d}",
@@ -600,7 +672,9 @@ static void fscache_drop_object(struct fscache_object *object)
}
/* this just shifts the object release to the slow work processor */
+ fscache_stat(&fscache_n_cop_put_object);
object->cache->ops->put_object(object);
+ fscache_stat_d(&fscache_n_cop_put_object);
_leave("");
}
@@ -690,8 +764,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
+ int ret;
- return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+ fscache_stat(&fscache_n_cop_grab_object);
+ ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+ fscache_stat_d(&fscache_n_cop_grab_object);
+ return ret;
}
/*
@@ -702,7 +780,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work)
struct fscache_object *object =
container_of(work, struct fscache_object, work);
- return object->cache->ops->put_object(object);
+ fscache_stat(&fscache_n_cop_put_object);
+ object->cache->ops->put_object(object);
+ fscache_stat_d(&fscache_n_cop_put_object);
}
/*
@@ -739,7 +819,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
/* sort onto appropriate lists */
fscache_enqueue_object(dep);
+ fscache_stat(&fscache_n_cop_put_object);
dep->cache->ops->put_object(dep);
+ fscache_stat_d(&fscache_n_cop_put_object);
if (!list_empty(&object->dependents))
cond_resched_lock(&object->lock);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6..313e79a1426 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -13,6 +13,7 @@
#define FSCACHE_DEBUG_LEVEL OPERATION
#include <linux/module.h>
+#include <linux/seq_file.h>
#include "internal.h"
atomic_t fscache_op_debug_id;
@@ -31,32 +32,33 @@ void fscache_enqueue_operation(struct fscache_operation *op)
_enter("{OBJ%x OP%x,%u}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+ fscache_set_op_state(op, "EnQ");
+
+ ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
ASSERTCMP(atomic_read(&op->usage), >, 0);
- if (list_empty(&op->pend_link)) {
- switch (op->flags & FSCACHE_OP_TYPE) {
- case FSCACHE_OP_FAST:
- _debug("queue fast");
- atomic_inc(&op->usage);
- if (!schedule_work(&op->fast_work))
- fscache_put_operation(op);
- break;
- case FSCACHE_OP_SLOW:
- _debug("queue slow");
- slow_work_enqueue(&op->slow_work);
- break;
- case FSCACHE_OP_MYTHREAD:
- _debug("queue for caller's attention");
- break;
- default:
- printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
- op->flags);
- BUG();
- break;
- }
- fscache_stat(&fscache_n_op_enqueue);
+ fscache_stat(&fscache_n_op_enqueue);
+ switch (op->flags & FSCACHE_OP_TYPE) {
+ case FSCACHE_OP_FAST:
+ _debug("queue fast");
+ atomic_inc(&op->usage);
+ if (!schedule_work(&op->fast_work))
+ fscache_put_operation(op);
+ break;
+ case FSCACHE_OP_SLOW:
+ _debug("queue slow");
+ slow_work_enqueue(&op->slow_work);
+ break;
+ case FSCACHE_OP_MYTHREAD:
+ _debug("queue for caller's attention");
+ break;
+ default:
+ printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
+ op->flags);
+ BUG();
+ break;
}
}
EXPORT_SYMBOL(fscache_enqueue_operation);
@@ -67,6 +69,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
static void fscache_run_op(struct fscache_object *object,
struct fscache_operation *op)
{
+ fscache_set_op_state(op, "Run");
+
object->n_in_progress++;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -87,9 +91,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+ fscache_set_op_state(op, "SubmitX");
+
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+ ASSERT(list_empty(&op->pend_link));
ret = -ENOBUFS;
if (fscache_object_is_active(object)) {
@@ -190,9 +197,12 @@ int fscache_submit_op(struct fscache_object *object,
ASSERTCMP(atomic_read(&op->usage), >, 0);
+ fscache_set_op_state(op, "Submit");
+
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+ ASSERT(list_empty(&op->pend_link));
ostate = object->state;
smp_rmb();
@@ -222,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object,
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
+ } else if (object->state == FSCACHE_OBJECT_DYING ||
+ object->state == FSCACHE_OBJECT_LC_DYING ||
+ object->state == FSCACHE_OBJECT_WITHDRAWING) {
+ fscache_stat(&fscache_n_op_rejected);
+ ret = -ENOBUFS;
} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
fscache_report_unexpected_submission(object, op, ostate);
ASSERT(!fscache_object_is_active(object));
@@ -264,12 +279,7 @@ void fscache_start_operations(struct fscache_object *object)
stop = true;
}
list_del_init(&op->pend_link);
- object->n_in_progress++;
-
- if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
- wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- if (op->processor)
- fscache_enqueue_operation(op);
+ fscache_run_op(object, op);
/* the pending queue was holding a ref on the object */
fscache_put_operation(op);
@@ -282,6 +292,36 @@ void fscache_start_operations(struct fscache_object *object)
}
/*
+ * cancel an operation that's pending on an object
+ */
+int fscache_cancel_op(struct fscache_operation *op)
+{
+ struct fscache_object *object = op->object;
+ int ret;
+
+ _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+
+ spin_lock(&object->lock);
+
+ ret = -EBUSY;
+ if (!list_empty(&op->pend_link)) {
+ fscache_stat(&fscache_n_op_cancelled);
+ list_del_init(&op->pend_link);
+ object->n_ops--;
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ fscache_put_operation(op);
+ ret = 0;
+ }
+
+ spin_unlock(&object->lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
* release an operation
* - queues pending ops if this is the last in-progress op
*/
@@ -298,6 +338,8 @@ void fscache_put_operation(struct fscache_operation *op)
if (!atomic_dec_and_test(&op->usage))
return;
+ fscache_set_op_state(op, "Put");
+
_debug("PUT OP");
if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
BUG();
@@ -311,6 +353,9 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
* lock, and defer it otherwise */
@@ -452,8 +497,27 @@ static void fscache_op_execute(struct slow_work *work)
_leave("");
}
+/*
+ * describe an operation for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
+{
+ struct fscache_operation *op =
+ container_of(work, struct fscache_operation, slow_work);
+
+ seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
+ op->object->debug_id, op->debug_id,
+ op->name, op->state, op->flags);
+}
+#endif
+
const struct slow_work_ops fscache_op_slow_work_ops = {
+ .owner = THIS_MODULE,
.get_ref = fscache_op_get_ref,
.put_ref = fscache_op_put_ref,
.execute = fscache_op_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+ .desc = fscache_op_desc,
+#endif
};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 2568e0eb644..c598ea4c4e7 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,18 +43,102 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
EXPORT_SYMBOL(__fscache_wait_on_page_write);
/*
- * note that a page has finished being written to the cache
+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - we're allowed to sleep if __GFP_WAIT is flagged
*/
-static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp)
{
struct page *xpage;
+ void *val;
+
+ _enter("%p,%p,%x", cookie, page, gfp);
+
+ rcu_read_lock();
+ val = radix_tree_lookup(&cookie->stores, page->index);
+ if (!val) {
+ rcu_read_unlock();
+ fscache_stat(&fscache_n_store_vmscan_not_storing);
+ __fscache_uncache_page(cookie, page);
+ return true;
+ }
+
+ /* see if the page is actually undergoing storage - if so we can't get
+ * rid of it till the cache has finished with it */
+ if (radix_tree_tag_get(&cookie->stores, page->index,
+ FSCACHE_COOKIE_STORING_TAG)) {
+ rcu_read_unlock();
+ goto page_busy;
+ }
+
+ /* the page is pending storage, so we attempt to cancel the store and
+ * discard the store request so that the page can be reclaimed */
+ spin_lock(&cookie->stores_lock);
+ rcu_read_unlock();
+
+ if (radix_tree_tag_get(&cookie->stores, page->index,
+ FSCACHE_COOKIE_STORING_TAG)) {
+ /* the page started to undergo storage whilst we were looking,
+ * so now we can only wait or return */
+ spin_unlock(&cookie->stores_lock);
+ goto page_busy;
+ }
- spin_lock(&cookie->lock);
xpage = radix_tree_delete(&cookie->stores, page->index);
- spin_unlock(&cookie->lock);
- ASSERT(xpage != NULL);
+ spin_unlock(&cookie->stores_lock);
+
+ if (xpage) {
+ fscache_stat(&fscache_n_store_vmscan_cancelled);
+ fscache_stat(&fscache_n_store_radix_deletes);
+ ASSERTCMP(xpage, ==, page);
+ } else {
+ fscache_stat(&fscache_n_store_vmscan_gone);
+ }
wake_up_bit(&cookie->flags, 0);
+ if (xpage)
+ page_cache_release(xpage);
+ __fscache_uncache_page(cookie, page);
+ return true;
+
+page_busy:
+ /* we might want to wait here, but that could deadlock the allocator as
+ * the slow-work threads writing to the cache may all end up sleeping
+ * on memory allocation */
+ fscache_stat(&fscache_n_store_vmscan_busy);
+ return false;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_object *object,
+ struct page *page)
+{
+ struct fscache_cookie *cookie;
+ struct page *xpage = NULL;
+
+ spin_lock(&object->lock);
+ cookie = object->cookie;
+ if (cookie) {
+ /* delete the page from the tree if it is now no longer
+ * pending */
+ spin_lock(&cookie->stores_lock);
+ radix_tree_tag_clear(&cookie->stores, page->index,
+ FSCACHE_COOKIE_STORING_TAG);
+ if (!radix_tree_tag_get(&cookie->stores, page->index,
+ FSCACHE_COOKIE_PENDING_TAG)) {
+ fscache_stat(&fscache_n_store_radix_deletes);
+ xpage = radix_tree_delete(&cookie->stores, page->index);
+ }
+ spin_unlock(&cookie->stores_lock);
+ wake_up_bit(&cookie->flags, 0);
+ }
+ spin_unlock(&object->lock);
+ if (xpage)
+ page_cache_release(xpage);
}
/*
@@ -63,14 +147,21 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p
static void fscache_attr_changed_op(struct fscache_operation *op)
{
struct fscache_object *object = op->object;
+ int ret;
_enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object) &&
- object->cache->ops->attr_changed(object) < 0)
- fscache_abort_object(object);
+ if (fscache_object_is_active(object)) {
+ fscache_set_op_state(op, "CallFS");
+ fscache_stat(&fscache_n_cop_attr_changed);
+ ret = object->cache->ops->attr_changed(object);
+ fscache_stat_d(&fscache_n_cop_attr_changed);
+ fscache_set_op_state(op, "Done");
+ if (ret < 0)
+ fscache_abort_object(object);
+ }
_leave("");
}
@@ -99,6 +190,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
fscache_operation_init(op, NULL);
fscache_operation_init_slow(op, fscache_attr_changed_op);
op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+ fscache_set_op_name(op, "Attr");
spin_lock(&cookie->lock);
@@ -184,6 +276,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
op->start_time = jiffies;
INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
INIT_LIST_HEAD(&op->to_do);
+ fscache_set_op_name(&op->op, "Retr");
return op;
}
@@ -221,6 +314,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
}
/*
+ * wait for an object to become active (or dead)
+ */
+static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
+ struct fscache_retrieval *op,
+ atomic_t *stat_op_waits,
+ atomic_t *stat_object_dead)
+{
+ int ret;
+
+ if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
+ goto check_if_dead;
+
+ _debug(">>> WT");
+ fscache_stat(stat_op_waits);
+ if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit_interruptible,
+ TASK_INTERRUPTIBLE) < 0) {
+ ret = fscache_cancel_op(&op->op);
+ if (ret == 0)
+ return -ERESTARTSYS;
+
+ /* it's been removed from the pending queue by another party,
+ * so we should get to run shortly */
+ wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ }
+ _debug("<<< GO");
+
+check_if_dead:
+ if (unlikely(fscache_object_is_dead(object))) {
+ fscache_stat(stat_object_dead);
+ return -ENOBUFS;
+ }
+ return 0;
+}
+
+/*
* read a page from the cache or allocate a block in which to store it
* - we return:
* -ENOMEM - out of memory, nothing done
@@ -257,6 +387,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
_leave(" = -ENOMEM");
return -ENOMEM;
}
+ fscache_set_op_name(&op->op, "RetrRA1");
spin_lock(&cookie->lock);
@@ -267,6 +398,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+ atomic_inc(&object->n_reads);
+ set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
if (fscache_submit_op(object, &op->op) < 0)
goto nobufs_unlock;
spin_unlock(&cookie->lock);
@@ -279,23 +413,27 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
- if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
- _debug(">>> WT");
- fscache_stat(&fscache_n_retrieval_op_waits);
- wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- _debug("<<< GO");
- }
+ ret = fscache_wait_for_retrieval_activation(
+ object, op,
+ __fscache_stat(&fscache_n_retrieval_op_waits),
+ __fscache_stat(&fscache_n_retrievals_object_dead));
+ if (ret < 0)
+ goto error;
/* ask the cache to honour the operation */
if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+ fscache_stat(&fscache_n_cop_allocate_page);
ret = object->cache->ops->allocate_page(op, page, gfp);
+ fscache_stat_d(&fscache_n_cop_allocate_page);
if (ret == 0)
ret = -ENODATA;
} else {
+ fscache_stat(&fscache_n_cop_read_or_alloc_page);
ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+ fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
}
+error:
if (ret == -ENOMEM)
fscache_stat(&fscache_n_retrievals_nomem);
else if (ret == -ERESTARTSYS)
@@ -347,7 +485,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
void *context,
gfp_t gfp)
{
- fscache_pages_retrieval_func_t func;
struct fscache_retrieval *op;
struct fscache_object *object;
int ret;
@@ -369,6 +506,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(mapping, end_io_func, context);
if (!op)
return -ENOMEM;
+ fscache_set_op_name(&op->op, "RetrRAN");
spin_lock(&cookie->lock);
@@ -377,6 +515,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
+ atomic_inc(&object->n_reads);
+ set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
if (fscache_submit_op(object, &op->op) < 0)
goto nobufs_unlock;
spin_unlock(&cookie->lock);
@@ -389,21 +530,27 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
- if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
- _debug(">>> WT");
- fscache_stat(&fscache_n_retrieval_op_waits);
- wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- _debug("<<< GO");
- }
+ ret = fscache_wait_for_retrieval_activation(
+ object, op,
+ __fscache_stat(&fscache_n_retrieval_op_waits),
+ __fscache_stat(&fscache_n_retrievals_object_dead));
+ if (ret < 0)
+ goto error;
/* ask the cache to honour the operation */
- if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
- func = object->cache->ops->allocate_pages;
- else
- func = object->cache->ops->read_or_alloc_pages;
- ret = func(op, pages, nr_pages, gfp);
+ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+ fscache_stat(&fscache_n_cop_allocate_pages);
+ ret = object->cache->ops->allocate_pages(
+ op, pages, nr_pages, gfp);
+ fscache_stat_d(&fscache_n_cop_allocate_pages);
+ } else {
+ fscache_stat(&fscache_n_cop_read_or_alloc_pages);
+ ret = object->cache->ops->read_or_alloc_pages(
+ op, pages, nr_pages, gfp);
+ fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
+ }
+error:
if (ret == -ENOMEM)
fscache_stat(&fscache_n_retrievals_nomem);
else if (ret == -ERESTARTSYS)
@@ -461,6 +608,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
+ fscache_set_op_name(&op->op, "RetrAL1");
spin_lock(&cookie->lock);
@@ -475,18 +623,22 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_alloc_ops);
- if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
- _debug(">>> WT");
- fscache_stat(&fscache_n_alloc_op_waits);
- wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- _debug("<<< GO");
- }
+ ret = fscache_wait_for_retrieval_activation(
+ object, op,
+ __fscache_stat(&fscache_n_alloc_op_waits),
+ __fscache_stat(&fscache_n_allocs_object_dead));
+ if (ret < 0)
+ goto error;
/* ask the cache to honour the operation */
+ fscache_stat(&fscache_n_cop_allocate_page);
ret = object->cache->ops->allocate_page(op, page, gfp);
+ fscache_stat_d(&fscache_n_cop_allocate_page);
- if (ret < 0)
+error:
+ if (ret == -ERESTARTSYS)
+ fscache_stat(&fscache_n_allocs_intr);
+ else if (ret < 0)
fscache_stat(&fscache_n_allocs_nobufs);
else
fscache_stat(&fscache_n_allocs_ok);
@@ -521,7 +673,7 @@ static void fscache_write_op(struct fscache_operation *_op)
struct fscache_storage *op =
container_of(_op, struct fscache_storage, op);
struct fscache_object *object = op->op.object;
- struct fscache_cookie *cookie = object->cookie;
+ struct fscache_cookie *cookie;
struct page *page;
unsigned n;
void *results[1];
@@ -529,16 +681,19 @@ static void fscache_write_op(struct fscache_operation *_op)
_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
- spin_lock(&cookie->lock);
+ fscache_set_op_state(&op->op, "GetPage");
+
spin_lock(&object->lock);
+ cookie = object->cookie;
- if (!fscache_object_is_active(object)) {
+ if (!fscache_object_is_active(object) || !cookie) {
spin_unlock(&object->lock);
- spin_unlock(&cookie->lock);
_leave("");
return;
}
+ spin_lock(&cookie->stores_lock);
+
fscache_stat(&fscache_n_store_calls);
/* find a page to store */
@@ -549,23 +704,35 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index > op->store_limit)
+ if (page->index > op->store_limit) {
+ fscache_stat(&fscache_n_store_pages_over_limit);
goto superseded;
+ }
- radix_tree_tag_clear(&cookie->stores, page->index,
- FSCACHE_COOKIE_PENDING_TAG);
+ if (page) {
+ radix_tree_tag_set(&cookie->stores, page->index,
+ FSCACHE_COOKIE_STORING_TAG);
+ radix_tree_tag_clear(&cookie->stores, page->index,
+ FSCACHE_COOKIE_PENDING_TAG);
+ }
+ spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
- spin_unlock(&cookie->lock);
if (page) {
+ fscache_set_op_state(&op->op, "Store");
+ fscache_stat(&fscache_n_store_pages);
+ fscache_stat(&fscache_n_cop_write_page);
ret = object->cache->ops->write_page(op, page);
- fscache_end_page_write(cookie, page);
- page_cache_release(page);
- if (ret < 0)
+ fscache_stat_d(&fscache_n_cop_write_page);
+ fscache_set_op_state(&op->op, "EndWrite");
+ fscache_end_page_write(object, page);
+ if (ret < 0) {
+ fscache_set_op_state(&op->op, "Abort");
fscache_abort_object(object);
- else
+ } else {
fscache_enqueue_operation(&op->op);
+ }
}
_leave("");
@@ -575,9 +742,9 @@ superseded:
/* this writer is going away and there aren't any more things to
* write */
_debug("cease");
+ spin_unlock(&cookie->stores_lock);
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
spin_unlock(&object->lock);
- spin_unlock(&cookie->lock);
_leave("");
}
@@ -634,6 +801,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_release_write_op);
fscache_operation_init_slow(&op->op, fscache_write_op);
op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+ fscache_set_op_name(&op->op, "Write1");
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
@@ -652,6 +820,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
/* add the page to the pending-storage radix tree on the backing
* object */
spin_lock(&object->lock);
+ spin_lock(&cookie->stores_lock);
_debug("store limit %llx", (unsigned long long) object->store_limit);
@@ -672,6 +841,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
goto already_pending;
+ spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
@@ -693,6 +863,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
already_queued:
fscache_stat(&fscache_n_stores_again);
already_pending:
+ spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
spin_unlock(&cookie->lock);
radix_tree_preload_end();
@@ -702,7 +873,9 @@ already_pending:
return 0;
submit_failed:
+ spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
+ spin_unlock(&cookie->stores_lock);
page_cache_release(page);
ret = -ENOBUFS;
goto nobufs;
@@ -763,7 +936,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
if (TestClearPageFsCache(page) &&
object->cache->ops->uncache_page) {
/* the cache backend releases the cookie lock */
+ fscache_stat(&fscache_n_cop_uncache_page);
object->cache->ops->uncache_page(object, page);
+ fscache_stat_d(&fscache_n_cop_uncache_page);
goto done;
}
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index beeab44bc31..1d9e4951a59 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -37,10 +37,20 @@ int __init fscache_proc_init(void)
goto error_histogram;
#endif
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+ if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
+ &fscache_objlist_fops))
+ goto error_objects;
+#endif
+
_leave(" = 0");
return 0;
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+error_objects:
+#endif
#ifdef CONFIG_FSCACHE_HISTOGRAM
+ remove_proc_entry("fs/fscache/histogram", NULL);
error_histogram:
#endif
#ifdef CONFIG_FSCACHE_STATS
@@ -58,6 +68,9 @@ error_dir:
*/
void fscache_proc_cleanup(void)
{
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+ remove_proc_entry("fs/fscache/objects", NULL);
+#endif
#ifdef CONFIG_FSCACHE_HISTOGRAM
remove_proc_entry("fs/fscache/histogram", NULL);
#endif
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 65deb99e756..46435f3aae6 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -25,6 +25,8 @@ atomic_t fscache_n_op_requeue;
atomic_t fscache_n_op_deferred_release;
atomic_t fscache_n_op_release;
atomic_t fscache_n_op_gc;
+atomic_t fscache_n_op_cancelled;
+atomic_t fscache_n_op_rejected;
atomic_t fscache_n_attr_changed;
atomic_t fscache_n_attr_changed_ok;
@@ -36,6 +38,8 @@ atomic_t fscache_n_allocs;
atomic_t fscache_n_allocs_ok;
atomic_t fscache_n_allocs_wait;
atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_allocs_intr;
+atomic_t fscache_n_allocs_object_dead;
atomic_t fscache_n_alloc_ops;
atomic_t fscache_n_alloc_op_waits;
@@ -46,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata;
atomic_t fscache_n_retrievals_nobufs;
atomic_t fscache_n_retrievals_intr;
atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrievals_object_dead;
atomic_t fscache_n_retrieval_ops;
atomic_t fscache_n_retrieval_op_waits;
@@ -56,6 +61,14 @@ atomic_t fscache_n_stores_nobufs;
atomic_t fscache_n_stores_oom;
atomic_t fscache_n_store_ops;
atomic_t fscache_n_store_calls;
+atomic_t fscache_n_store_pages;
+atomic_t fscache_n_store_radix_deletes;
+atomic_t fscache_n_store_pages_over_limit;
+
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
atomic_t fscache_n_marks;
atomic_t fscache_n_uncaches;
@@ -74,6 +87,7 @@ atomic_t fscache_n_updates_run;
atomic_t fscache_n_relinquishes;
atomic_t fscache_n_relinquishes_null;
atomic_t fscache_n_relinquishes_waitcrt;
+atomic_t fscache_n_relinquishes_retire;
atomic_t fscache_n_cookie_index;
atomic_t fscache_n_cookie_data;
@@ -84,6 +98,7 @@ atomic_t fscache_n_object_no_alloc;
atomic_t fscache_n_object_lookups;
atomic_t fscache_n_object_lookups_negative;
atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_lookups_timed_out;
atomic_t fscache_n_object_created;
atomic_t fscache_n_object_avail;
atomic_t fscache_n_object_dead;
@@ -93,6 +108,23 @@ atomic_t fscache_n_checkaux_okay;
atomic_t fscache_n_checkaux_update;
atomic_t fscache_n_checkaux_obsolete;
+atomic_t fscache_n_cop_alloc_object;
+atomic_t fscache_n_cop_lookup_object;
+atomic_t fscache_n_cop_lookup_complete;
+atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_update_object;
+atomic_t fscache_n_cop_drop_object;
+atomic_t fscache_n_cop_put_object;
+atomic_t fscache_n_cop_sync_cache;
+atomic_t fscache_n_cop_attr_changed;
+atomic_t fscache_n_cop_read_or_alloc_page;
+atomic_t fscache_n_cop_read_or_alloc_pages;
+atomic_t fscache_n_cop_allocate_page;
+atomic_t fscache_n_cop_allocate_pages;
+atomic_t fscache_n_cop_write_page;
+atomic_t fscache_n_cop_uncache_page;
+atomic_t fscache_n_cop_dissociate_pages;
+
/*
* display the general statistics
*/
@@ -129,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_acquires_nobufs),
atomic_read(&fscache_n_acquires_oom));
- seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+ seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
atomic_read(&fscache_n_object_lookups),
atomic_read(&fscache_n_object_lookups_negative),
atomic_read(&fscache_n_object_lookups_positive),
+ atomic_read(&fscache_n_object_lookups_timed_out),
atomic_read(&fscache_n_object_created));
seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
@@ -140,10 +173,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_updates_null),
atomic_read(&fscache_n_updates_run));
- seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+ seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
atomic_read(&fscache_n_relinquishes),
atomic_read(&fscache_n_relinquishes_null),
- atomic_read(&fscache_n_relinquishes_waitcrt));
+ atomic_read(&fscache_n_relinquishes_waitcrt),
+ atomic_read(&fscache_n_relinquishes_retire));
seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
atomic_read(&fscache_n_attr_changed),
@@ -152,14 +186,16 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_attr_changed_nomem),
atomic_read(&fscache_n_attr_changed_calls));
- seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+ seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
atomic_read(&fscache_n_allocs),
atomic_read(&fscache_n_allocs_ok),
atomic_read(&fscache_n_allocs_wait),
- atomic_read(&fscache_n_allocs_nobufs));
- seq_printf(m, "Allocs : ops=%u owt=%u\n",
+ atomic_read(&fscache_n_allocs_nobufs),
+ atomic_read(&fscache_n_allocs_intr));
+ seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
atomic_read(&fscache_n_alloc_ops),
- atomic_read(&fscache_n_alloc_op_waits));
+ atomic_read(&fscache_n_alloc_op_waits),
+ atomic_read(&fscache_n_allocs_object_dead));
seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
" int=%u oom=%u\n",
@@ -170,9 +206,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_retrievals_nobufs),
atomic_read(&fscache_n_retrievals_intr),
atomic_read(&fscache_n_retrievals_nomem));
- seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+ seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
atomic_read(&fscache_n_retrieval_ops),
- atomic_read(&fscache_n_retrieval_op_waits));
+ atomic_read(&fscache_n_retrieval_op_waits),
+ atomic_read(&fscache_n_retrievals_object_dead));
seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
atomic_read(&fscache_n_stores),
@@ -180,18 +217,49 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_stores_again),
atomic_read(&fscache_n_stores_nobufs),
atomic_read(&fscache_n_stores_oom));
- seq_printf(m, "Stores : ops=%u run=%u\n",
+ seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
atomic_read(&fscache_n_store_ops),
- atomic_read(&fscache_n_store_calls));
+ atomic_read(&fscache_n_store_calls),
+ atomic_read(&fscache_n_store_pages),
+ atomic_read(&fscache_n_store_radix_deletes),
+ atomic_read(&fscache_n_store_pages_over_limit));
- seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
+ seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+ atomic_read(&fscache_n_store_vmscan_not_storing),
+ atomic_read(&fscache_n_store_vmscan_gone),
+ atomic_read(&fscache_n_store_vmscan_busy),
+ atomic_read(&fscache_n_store_vmscan_cancelled));
+
+ seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
atomic_read(&fscache_n_op_pend),
atomic_read(&fscache_n_op_run),
- atomic_read(&fscache_n_op_enqueue));
+ atomic_read(&fscache_n_op_enqueue),
+ atomic_read(&fscache_n_op_cancelled),
+ atomic_read(&fscache_n_op_rejected));
seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
atomic_read(&fscache_n_op_deferred_release),
atomic_read(&fscache_n_op_release),
atomic_read(&fscache_n_op_gc));
+
+ seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
+ atomic_read(&fscache_n_cop_alloc_object),
+ atomic_read(&fscache_n_cop_lookup_object),
+ atomic_read(&fscache_n_cop_lookup_complete),
+ atomic_read(&fscache_n_cop_grab_object));
+ seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+ atomic_read(&fscache_n_cop_update_object),
+ atomic_read(&fscache_n_cop_drop_object),
+ atomic_read(&fscache_n_cop_put_object),
+ atomic_read(&fscache_n_cop_attr_changed),
+ atomic_read(&fscache_n_cop_sync_cache));
+ seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
+ atomic_read(&fscache_n_cop_read_or_alloc_page),
+ atomic_read(&fscache_n_cop_read_or_alloc_pages),
+ atomic_read(&fscache_n_cop_allocate_page),
+ atomic_read(&fscache_n_cop_allocate_pages),
+ atomic_read(&fscache_n_cop_write_page),
+ atomic_read(&fscache_n_cop_uncache_page),
+ atomic_read(&fscache_n_cop_dissociate_pages));
return 0;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 992f6c9410b..4787ae6c5c1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
if (fc->no_create)
return -ENOSYS;
+ if (flags & O_DIRECT)
+ return -EINVAL;
+
forget_req = fuse_get_req(fc);
if (IS_ERR(forget_req))
return PTR_ERR(forget_req);
@@ -712,8 +715,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (newent->d_inode)
+ if (newent->d_inode) {
+ fuse_invalidate_attr(newent->d_inode);
fuse_invalidate_entry_cache(newent);
+ }
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a3492f7d207..c18913a777a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
break;
}
}
- fuse_put_request(fc, req);
+ if (!IS_ERR(req))
+ fuse_put_request(fc, req);
if (res > 0)
*ppos = pos;
@@ -1599,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
kaddr += copy;
}
- kunmap(map);
+ kunmap(page);
}
return 0;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5971359d209..4dcddf83326 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,6 +8,8 @@ config GFS2_FS
select FS_POSIX_ACL
select CRC32
select SLOW_WORK
+ select QUOTA
+ select QUOTACTL
help
A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3fc4e3ac7d8..3eb1ea84617 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -12,6 +12,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/gfs2_ondisk.h>
@@ -26,108 +27,44 @@
#include "trans.h"
#include "util.h"
-#define ACL_ACCESS 1
-#define ACL_DEFAULT 0
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er, int *remove, mode_t *mode)
+static const char *gfs2_acl_name(int type)
{
- struct posix_acl *acl;
- int error;
-
- error = gfs2_acl_validate_remove(ip, access);
- if (error)
- return error;
-
- if (!er->er_data)
- return -EINVAL;
-
- acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl) {
- *remove = 1;
- return 0;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out;
-
- if (access) {
- error = posix_acl_equiv_mode(acl, mode);
- if (!error)
- *remove = 1;
- else if (error > 0)
- error = 0;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return GFS2_POSIX_ACL_ACCESS;
+ case ACL_TYPE_DEFAULT:
+ return GFS2_POSIX_ACL_DEFAULT;
}
-
-out:
- posix_acl_release(acl);
- return error;
-}
-
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
-{
- if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
- return -EOPNOTSUPP;
- if (!is_owner_or_cap(&ip->i_inode))
- return -EPERM;
- if (S_ISLNK(ip->i_inode.i_mode))
- return -EOPNOTSUPP;
- if (!access && !S_ISDIR(ip->i_inode.i_mode))
- return -EACCES;
-
- return 0;
+ return NULL;
}
-static int acl_get(struct gfs2_inode *ip, const char *name,
- struct posix_acl **acl, struct gfs2_ea_location *el,
- char **datap, unsigned int *lenp)
+static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
{
+ struct posix_acl *acl;
+ const char *name;
char *data;
- unsigned int len;
- int error;
-
- el->el_bh = NULL;
+ int len;
if (!ip->i_eattr)
- return 0;
-
- error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
- if (error)
- return error;
- if (!el->el_ea)
- return 0;
- if (!GFS2_EA_DATA_LEN(el->el_ea))
- goto out;
+ return NULL;
- len = GFS2_EA_DATA_LEN(el->el_ea);
- data = kmalloc(len, GFP_NOFS);
- error = -ENOMEM;
- if (!data)
- goto out;
+ acl = get_cached_acl(&ip->i_inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
- error = gfs2_ea_get_copy(ip, el, data, len);
- if (error < 0)
- goto out_kfree;
- error = 0;
+ name = gfs2_acl_name(type);
+ if (name == NULL)
+ return ERR_PTR(-EINVAL);
- if (acl) {
- *acl = posix_acl_from_xattr(data, len);
- if (IS_ERR(*acl))
- error = PTR_ERR(*acl);
- }
+ len = gfs2_xattr_acl_get(ip, name, &data);
+ if (len < 0)
+ return ERR_PTR(len);
+ if (len == 0)
+ return NULL;
-out_kfree:
- if (error || !datap) {
- kfree(data);
- } else {
- *datap = data;
- *lenp = len;
- }
-out:
- return error;
+ acl = posix_acl_from_xattr(data, len);
+ kfree(data);
+ return acl;
}
/**
@@ -140,14 +77,12 @@ out:
int gfs2_check_acl(struct inode *inode, int mask)
{
- struct gfs2_ea_location el;
- struct posix_acl *acl = NULL;
+ struct posix_acl *acl;
int error;
- error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
- brelse(el.el_bh);
- if (error)
- return error;
+ acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (acl) {
error = posix_acl_permission(inode, acl, mask);
@@ -158,57 +93,75 @@ int gfs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+static int gfs2_set_mode(struct inode *inode, mode_t mode)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *dibh;
- int error;
+ int error = 0;
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_assert_withdraw(sdp,
- (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT));
- ip->i_inode.i_mode = mode;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ iattr.ia_valid = ATTR_MODE;
+ iattr.ia_mode = mode;
+
+ error = gfs2_setattr_simple(GFS2_I(inode), &iattr);
}
- gfs2_trans_end(sdp);
+ return error;
+}
+
+static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+{
+ int error;
+ int len;
+ char *data;
+ const char *name = gfs2_acl_name(type);
- return 0;
+ BUG_ON(name == NULL);
+ len = posix_acl_to_xattr(acl, NULL, 0);
+ if (len == 0)
+ return 0;
+ data = kmalloc(len, GFP_NOFS);
+ if (data == NULL)
+ return -ENOMEM;
+ error = posix_acl_to_xattr(acl, data, len);
+ if (error < 0)
+ goto out;
+ error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+out:
+ kfree(data);
+ return error;
}
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
{
- struct gfs2_ea_location el;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct posix_acl *acl = NULL, *clone;
- mode_t mode = ip->i_inode.i_mode;
- char *data = NULL;
- unsigned int len;
- int error;
+ struct posix_acl *acl, *clone;
+ mode_t mode = inode->i_mode;
+ int error = 0;
if (!sdp->sd_args.ar_posix_acl)
return 0;
- if (S_ISLNK(ip->i_inode.i_mode))
+ if (S_ISLNK(inode->i_mode))
return 0;
- error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
- brelse(el.el_bh);
- if (error)
- return error;
+ acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (!acl) {
mode &= ~current_umask();
- if (mode != ip->i_inode.i_mode)
- error = munge_mode(ip, mode);
+ if (mode != inode->i_mode)
+ error = gfs2_set_mode(inode, mode);
return error;
}
+ if (S_ISDIR(inode->i_mode)) {
+ error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto out;
+ }
+
clone = posix_acl_clone(acl, GFP_NOFS);
error = -ENOMEM;
if (!clone)
@@ -216,43 +169,32 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
posix_acl_release(acl);
acl = clone;
- if (S_ISDIR(ip->i_inode.i_mode)) {
- error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
- GFS2_POSIX_ACL_DEFAULT, data, len, 0);
- if (error)
- goto out;
- }
-
error = posix_acl_create_masq(acl, &mode);
if (error < 0)
goto out;
if (error == 0)
goto munge;
- posix_acl_to_xattr(acl, data, len);
- error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
- GFS2_POSIX_ACL_ACCESS, data, len, 0);
+ error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
if (error)
goto out;
munge:
- error = munge_mode(ip, mode);
+ error = gfs2_set_mode(inode, mode);
out:
posix_acl_release(acl);
- kfree(data);
return error;
}
int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
{
- struct posix_acl *acl = NULL, *clone;
- struct gfs2_ea_location el;
+ struct posix_acl *acl, *clone;
char *data;
unsigned int len;
int error;
- error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
- if (error)
- goto out_brelse;
+ acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (!acl)
return gfs2_setattr_simple(ip, attr);
@@ -265,15 +207,134 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
error = posix_acl_chmod_masq(acl, attr->ia_mode);
if (!error) {
+ len = posix_acl_to_xattr(acl, NULL, 0);
+ data = kmalloc(len, GFP_NOFS);
+ error = -ENOMEM;
+ if (data == NULL)
+ goto out;
posix_acl_to_xattr(acl, data, len);
- error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+ error = gfs2_xattr_acl_chmod(ip, attr, data);
+ kfree(data);
+ set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
}
out:
posix_acl_release(acl);
- kfree(data);
-out_brelse:
- brelse(el.el_bh);
return error;
}
+static int gfs2_acl_type(const char *name)
+{
+ if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
+ return ACL_TYPE_ACCESS;
+ if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
+ return ACL_TYPE_DEFAULT;
+ return -EINVAL;
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type;
+ int error;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+
+ acl = gfs2_acl_get(GFS2_I(inode), type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+
+ error = posix_acl_to_xattr(acl, buffer, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct posix_acl *acl = NULL;
+ int error = 0, type;
+
+ if (!sdp->sd_args.ar_posix_acl)
+ return -EOPNOTSUPP;
+
+ type = gfs2_acl_type(name);
+ if (type < 0)
+ return type;
+ if (flags & XATTR_CREATE)
+ return -EINVAL;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return value ? -EACCES : 0;
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!value)
+ goto set_acl;
+
+ acl = posix_acl_from_xattr(value, size);
+ if (!acl) {
+ /*
+ * acl_set_file(3) may request that we set default ACLs with
+ * zero length -- defend (gracefully) against that here.
+ */
+ goto out;
+ }
+ if (IS_ERR(acl)) {
+ error = PTR_ERR(acl);
+ goto out;
+ }
+
+ error = posix_acl_valid(acl);
+ if (error)
+ goto out_release;
+
+ error = -EINVAL;
+ if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
+ goto out_release;
+
+ if (type == ACL_TYPE_ACCESS) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+
+ if (error <= 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+
+ if (error < 0)
+ return error;
+ }
+
+ error = gfs2_set_mode(inode, mode);
+ if (error)
+ goto out_release;
+ }
+
+set_acl:
+ error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+ if (!error) {
+ if (acl)
+ set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
+ }
+out_release:
+ posix_acl_release(acl);
+out:
+ return error;
+}
+
+struct xattr_handler gfs2_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = gfs2_xattr_system_get,
+ .set = gfs2_xattr_system_set,
+};
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 6751930bfb6..9306a2e6620 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,26 +13,12 @@
#include "incore.h"
#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_ACCESS_LEN 16
#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
-#define GFS2_POSIX_ACL_DEFAULT_LEN 17
+#define GFS2_ACL_MAX_ENTRIES 25
-#define GFS2_ACL_IS_ACCESS(name, len) \
- ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
- !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
-
-#define GFS2_ACL_IS_DEFAULT(name, len) \
- ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
- !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
-
-struct gfs2_ea_request;
-
-int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode);
-int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
-int gfs2_check_acl(struct inode *inode, int mask);
-int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
+extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+extern struct xattr_handler gfs2_xattr_system_handler;
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 694b5d48f03..7b8da941526 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -269,7 +269,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int i;
int ret;
@@ -313,11 +312,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
if (ret || (--(wbc->nr_to_write) <= 0))
ret = 1;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- ret = 1;
- }
-
}
gfs2_trans_end(sdp);
return ret;
@@ -338,7 +332,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
static int gfs2_write_cache_jdata(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
struct pagevec pvec;
@@ -348,11 +341,6 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
int scanned = 0;
int range_whole = 0;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- return 0;
- }
-
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
@@ -819,8 +807,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
mark_inode_dirty(inode);
}
- if (inode == sdp->sd_rindex)
+ if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
+ ip->i_gh.gh_flags |= GL_NOCACHE;
+ }
brelse(dibh);
gfs2_trans_end(sdp);
@@ -889,8 +879,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
mark_inode_dirty(inode);
}
- if (inode == sdp->sd_rindex)
+ if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
+ ip->i_gh.gh_flags |= GL_NOCACHE;
+ }
brelse(dibh);
gfs2_trans_end(sdp);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 297d7e5ceba..25fddc100f1 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -525,38 +525,6 @@ consist_inode:
return ERR_PTR(-EIO);
}
-
-/**
- * dirent_first - Return the first dirent
- * @dip: the directory
- * @bh: The buffer
- * @dent: Pointer to list of dirents
- *
- * return first dirent whether bh points to leaf or stuffed dinode
- *
- * Returns: IS_LEAF, IS_DINODE, or -errno
- */
-
-static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
- struct gfs2_dirent **dent)
-{
- struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
-
- if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
- if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
- return -EIO;
- *dent = (struct gfs2_dirent *)(bh->b_data +
- sizeof(struct gfs2_leaf));
- return IS_LEAF;
- } else {
- if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
- return -EIO;
- *dent = (struct gfs2_dirent *)(bh->b_data +
- sizeof(struct gfs2_dinode));
- return IS_DINODE;
- }
-}
-
static int dirent_check_reclen(struct gfs2_inode *dip,
const struct gfs2_dirent *d, const void *end_p)
{
@@ -1006,7 +974,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
divider = (start + half_len) << (32 - dip->i_depth);
/* Copy the entries */
- dirent_first(dip, obh, &dent);
+ dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
do {
next = dent;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8b674b1f3a5..f455a03a09e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -241,15 +241,14 @@ int gfs2_glock_put(struct gfs2_glock *gl)
int rv = 0;
write_lock(gl_lock_addr(gl->gl_hash));
- if (atomic_dec_and_test(&gl->gl_ref)) {
+ if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
hlist_del(&gl->gl_list);
- write_unlock(gl_lock_addr(gl->gl_hash));
- spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
}
spin_unlock(&lru_lock);
+ write_unlock(gl_lock_addr(gl->gl_hash));
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
@@ -513,7 +512,6 @@ retry:
GLOCK_BUG_ON(gl, 1);
}
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
return;
}
@@ -524,8 +522,6 @@ retry:
if (glops->go_xmote_bh) {
spin_unlock(&gl->gl_spin);
rv = glops->go_xmote_bh(gl, gh);
- if (rv == -EAGAIN)
- return;
spin_lock(&gl->gl_spin);
if (rv) {
do_error(gl, rv);
@@ -540,7 +536,6 @@ out:
clear_bit(GLF_LOCK, &gl->gl_flags);
out_locked:
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
}
static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
@@ -600,7 +595,6 @@ __acquires(&gl->gl_spin)
if (!(ret & LM_OUT_ASYNC)) {
finish_xmote(gl, ret);
- gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
} else {
@@ -672,12 +666,17 @@ out:
return;
out_sched:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ smp_mb__after_clear_bit();
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put_nolock(gl);
+ return;
+
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
- goto out;
+ smp_mb__after_clear_bit();
+ return;
}
static void delete_work_func(struct work_struct *work)
@@ -707,9 +706,12 @@ static void glock_work_func(struct work_struct *work)
{
unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ int drop_ref = 0;
- if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
finish_xmote(gl, gl->gl_reply);
+ drop_ref = 1;
+ }
down_read(&gfs2_umount_flush_sem);
spin_lock(&gl->gl_spin);
if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -727,6 +729,8 @@ static void glock_work_func(struct work_struct *work)
if (!delay ||
queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
+ if (drop_ref)
+ gfs2_glock_put(gl);
}
/**
@@ -1361,10 +1365,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
- /* Check if glock is about to be freed */
- if (atomic_read(&gl->gl_ref) == 0)
- continue;
-
/* Test for being demotable */
if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
gfs2_glock_hold(gl);
@@ -1375,10 +1375,11 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
handle_callback(gl, LM_ST_UNLOCKED, 0);
nr--;
}
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ smp_mb__after_clear_bit();
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put_nolock(gl);
spin_unlock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_lock(&lru_lock);
continue;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c609894ec0d..13f0bd22813 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,15 +180,6 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
return gl->gl_state == LM_ST_SHARED;
}
-static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
-{
- int ret;
- spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- return ret;
-}
-
int gfs2_glock_get(struct gfs2_sbd *sdp,
u64 number, const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 6985eef06c3..78554acc060 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
+#include <linux/posix_acl.h>
#include "gfs2.h"
#include "incore.h"
@@ -184,8 +185,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (flags & DIO_METADATA) {
struct address_space *mapping = gl->gl_aspace->i_mapping;
truncate_inode_pages(mapping, 0);
- if (ip)
+ if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
+ forget_all_cached_acls(&ip->i_inode);
+ }
}
if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6edb423f90b..4792200978c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -429,7 +429,11 @@ struct gfs2_args {
unsigned int ar_meta:1; /* mount metafs */
unsigned int ar_discard:1; /* discard requests */
unsigned int ar_errors:2; /* errors=withdraw | panic */
+ unsigned int ar_nobarrier:1; /* do not send barriers */
int ar_commit; /* Commit interval */
+ int ar_statfs_quantum; /* The fast statfs interval */
+ int ar_quota_quantum; /* The quota interval */
+ int ar_statfs_percent; /* The % change to force sync */
};
struct gfs2_tune {
@@ -558,6 +562,7 @@ struct gfs2_sbd {
spinlock_t sd_statfs_spin;
struct gfs2_statfs_change_host sd_statfs_master;
struct gfs2_statfs_change_host sd_statfs_local;
+ int sd_statfs_force_sync;
/* Resource group stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fb15d3b1f40..26ba2a4c4a2 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -871,7 +871,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
- error = gfs2_acl_create(dip, GFS2_I(inode));
+ error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock2;
@@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
- str->di_header.__pad0 = 0;
str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
- str->di_header.__pad1 = 0;
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f6..4511b08fc45 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
memset(lh, 0, sizeof(struct gfs2_log_header));
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.__pad0 = cpu_to_be64(0);
lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
lh->lh_flags = cpu_to_be32(flags);
lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5..de97632ba32 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+ struct gfs2_meta_header *mh;
struct gfs2_trans *tr;
lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
gfs2_meta_check(sdp, bd->bd_bh);
gfs2_pin(sdp, bd->bd_bh);
+ mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+ mh->__pad0 = cpu_to_be64(0);
+ mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
sdp->sd_log_num_buf++;
list_add(&le->le_list, &sdp->sd_log_le_buf);
tr->tr_num_buf_new++;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d08..5b31f7741a8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
if (error)
goto fail_unregister;
- error = slow_work_register_user();
+ error = slow_work_register_user(THIS_MODULE);
if (error)
goto fail_slow;
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
- slow_work_unregister_user();
+ slow_work_unregister_user(THIS_MODULE);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fb6c04898..edfee24f363 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
#include <linux/mount.h>
#include <linux/gfs2_ondisk.h>
#include <linux/slow-work.h>
+#include <linux/quotaops.h>
#include "gfs2.h"
#include "incore.h"
@@ -62,13 +63,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_warn_period = 10;
gt->gt_quota_scale_num = 1;
gt->gt_quota_scale_den = 1;
- gt->gt_quota_quantum = 60;
gt->gt_new_files_jdata = 0;
gt->gt_max_readahead = 1 << 18;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
- gt->gt_statfs_quantum = 30;
- gt->gt_statfs_slow = 0;
}
static struct gfs2_sbd *init_sbd(struct super_block *sb)
@@ -1114,7 +1112,7 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp)
* Returns: errno
*/
-static int fill_super(struct super_block *sb, void *data, int silent)
+static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
{
struct gfs2_sbd *sdp;
struct gfs2_holder mount_gh;
@@ -1125,17 +1123,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
return -ENOMEM;
}
-
- sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
- sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
- sdp->sd_args.ar_commit = 60;
- sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
-
- error = gfs2_mount_args(sdp, &sdp->sd_args, data);
- if (error) {
- printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
- goto fail;
- }
+ sdp->sd_args = *args;
if (sdp->sd_args.ar_spectator) {
sb->s_flags |= MS_RDONLY;
@@ -1143,11 +1131,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
}
if (sdp->sd_args.ar_posix_acl)
sb->s_flags |= MS_POSIXACL;
+ if (sdp->sd_args.ar_nobarrier)
+ set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_export_op = &gfs2_export_ops;
sb->s_xattr = gfs2_xattr_handlers;
+ sb->s_qcop = &gfs2_quotactl_ops;
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
sb->s_time_gran = 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1160,6 +1152,15 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+ sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
+ if (sdp->sd_args.ar_statfs_quantum) {
+ sdp->sd_tune.gt_statfs_slow = 0;
+ sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
+ }
+ else {
+ sdp->sd_tune.gt_statfs_slow = 1;
+ sdp->sd_tune.gt_statfs_quantum = 30;
+ }
error = init_names(sdp, silent);
if (error)
@@ -1243,18 +1244,127 @@ fail:
return error;
}
-static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data, struct vfsmount *mnt)
+static int set_gfs2_super(struct super_block *s, void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+ s->s_bdev = data;
+ s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+ return 0;
}
-static int test_meta_super(struct super_block *s, void *ptr)
+static int test_gfs2_super(struct super_block *s, void *ptr)
{
struct block_device *bdev = ptr;
return (bdev == s->s_bdev);
}
+/**
+ * gfs2_get_sb - Get the GFS2 superblock
+ * @fs_type: The GFS2 filesystem type
+ * @flags: Mount flags
+ * @dev_name: The name of the device
+ * @data: The mount arguments
+ * @mnt: The vfsmnt for this mount
+ *
+ * Q. Why not use get_sb_bdev() ?
+ * A. We need to select one of two root directories to mount, independent
+ * of whether this is the initial, or subsequent, mount of this sb
+ *
+ * Returns: 0 or -ve on error
+ */
+
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ struct block_device *bdev;
+ struct super_block *s;
+ fmode_t mode = FMODE_READ;
+ int error;
+ struct gfs2_args args;
+ struct gfs2_sbd *sdp;
+
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = -EBUSY;
+ goto error_bdev;
+ }
+ s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ error = PTR_ERR(s);
+ if (IS_ERR(s))
+ goto error_bdev;
+
+ memset(&args, 0, sizeof(args));
+ args.ar_quota = GFS2_QUOTA_DEFAULT;
+ args.ar_data = GFS2_DATA_DEFAULT;
+ args.ar_commit = 60;
+ args.ar_statfs_quantum = 30;
+ args.ar_quota_quantum = 60;
+ args.ar_errors = GFS2_ERRORS_DEFAULT;
+
+ error = gfs2_mount_args(&args, data);
+ if (error) {
+ printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+ if (s->s_root)
+ goto error_super;
+ deactivate_locked_super(s);
+ return error;
+ }
+
+ if (s->s_root) {
+ error = -EBUSY;
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ goto error_super;
+ close_bdev_exclusive(bdev, mode);
+ } else {
+ char b[BDEVNAME_SIZE];
+
+ s->s_flags = flags;
+ s->s_mode = mode;
+ strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ sb_set_blocksize(s, block_size(bdev));
+ error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(s);
+ return error;
+ }
+ s->s_flags |= MS_ACTIVE;
+ bdev->bd_super = s;
+ }
+
+ sdp = s->s_fs_info;
+ mnt->mnt_sb = s;
+ if (args.ar_meta)
+ mnt->mnt_root = dget(sdp->sd_master_dir);
+ else
+ mnt->mnt_root = dget(sdp->sd_root_dir);
+ return 0;
+
+error_super:
+ deactivate_locked_super(s);
+error_bdev:
+ close_bdev_exclusive(bdev, mode);
+ return error;
+}
+
static int set_meta_super(struct super_block *s, void *ptr)
{
return -EINVAL;
@@ -1274,13 +1384,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
dev_name, error);
return error;
}
- s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+ s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super,
path.dentry->d_inode->i_sb->s_bdev);
path_put(&path);
if (IS_ERR(s)) {
printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
return PTR_ERR(s);
}
+ if ((flags ^ s->s_flags) & MS_RDONLY) {
+ deactivate_locked_super(s);
+ return -EBUSY;
+ }
sdp = s->s_fs_info;
mnt->mnt_sb = s;
mnt->mnt_root = dget(sdp->sd_master_dir);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 2e9b9326bfc..e3bf6eab875 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -15,7 +15,7 @@
* fuzziness in the current usage value of IDs that are being used on different
* nodes in the cluster simultaneously. So, it is possible for a user on
* multiple nodes to overrun their quota, but that overrun is controlable.
- * Since quota tags are part of transactions, there is no need to a quota check
+ * Since quota tags are part of transactions, there is no need for a quota check
* program to be run on node crashes or anything like that.
*
* There are couple of knobs that let the administrator manage the quota
@@ -47,6 +47,8 @@
#include <linux/gfs2_ondisk.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
#include "gfs2.h"
#include "incore.h"
@@ -65,13 +67,6 @@
#define QUOTA_USER 1
#define QUOTA_GROUP 0
-struct gfs2_quota_host {
- u64 qu_limit;
- u64 qu_warn;
- s64 qu_value;
- u32 qu_ll_next;
-};
-
struct gfs2_quota_change_host {
u64 qc_change;
u32 qc_flags; /* GFS2_QCF_... */
@@ -164,7 +159,7 @@ fail:
return error;
}
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -202,7 +197,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
spin_unlock(&qd_lru_lock);
- if (qd || !create) {
+ if (qd) {
if (new_qd) {
gfs2_glock_put(new_qd->qd_gl);
kmem_cache_free(gfs2_quotad_cachep, new_qd);
@@ -461,12 +456,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
qd_put(qd);
}
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data **qdp)
{
int error;
- error = qd_get(sdp, user, id, create, qdp);
+ error = qd_get(sdp, user, id, qdp);
if (error)
return error;
@@ -508,20 +503,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
if (error)
goto out;
al->al_qd_num++;
qd++;
- error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
if (error)
goto out;
al->al_qd_num++;
qd++;
if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
- error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_USER, uid, qd);
if (error)
goto out;
al->al_qd_num++;
@@ -529,7 +524,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
}
if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
- error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+ error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
if (error)
goto out;
al->al_qd_num++;
@@ -617,48 +612,36 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
mutex_unlock(&sdp->sd_quota_mutex);
}
-static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
-{
- const struct gfs2_quota *str = buf;
-
- qu->qu_limit = be64_to_cpu(str->qu_limit);
- qu->qu_warn = be64_to_cpu(str->qu_warn);
- qu->qu_value = be64_to_cpu(str->qu_value);
- qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
-}
-
-static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
-{
- struct gfs2_quota *str = buf;
-
- str->qu_limit = cpu_to_be64(qu->qu_limit);
- str->qu_warn = cpu_to_be64(qu->qu_warn);
- str->qu_value = cpu_to_be64(qu->qu_value);
- str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
- memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
-}
-
/**
- * gfs2_adjust_quota
+ * gfs2_adjust_quota - adjust record of current block usage
+ * @ip: The quota inode
+ * @loc: Offset of the entry in the quota file
+ * @change: The amount of usage change to record
+ * @qd: The quota data
+ * @fdq: The updated limits to record
*
* This function was mostly borrowed from gfs2_block_truncate_page which was
* in turn mostly borrowed from ext3
+ *
+ * Returns: 0 or -ve on error
*/
+
static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
- s64 change, struct gfs2_quota_data *qd)
+ s64 change, struct gfs2_quota_data *qd,
+ struct fs_disk_quota *fdq)
{
struct inode *inode = &ip->i_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long index = loc >> PAGE_CACHE_SHIFT;
unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
unsigned blocksize, iblock, pos;
- struct buffer_head *bh;
+ struct buffer_head *bh, *dibh;
struct page *page;
void *kaddr;
- char *ptr;
- struct gfs2_quota_host qp;
+ struct gfs2_quota *qp;
s64 value;
int err = -EIO;
+ u64 size;
if (gfs2_is_stuffed(ip))
gfs2_unstuff_dinode(ip, NULL);
@@ -700,18 +683,38 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
gfs2_trans_add_bh(ip->i_gl, bh, 0);
kaddr = kmap_atomic(page, KM_USER0);
- ptr = kaddr + offset;
- gfs2_quota_in(&qp, ptr);
- qp.qu_value += change;
- value = qp.qu_value;
- gfs2_quota_out(&qp, ptr);
+ qp = kaddr + offset;
+ value = (s64)be64_to_cpu(qp->qu_value) + change;
+ qp->qu_value = cpu_to_be64(value);
+ qd->qd_qb.qb_value = qp->qu_value;
+ if (fdq) {
+ if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+ qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+ qd->qd_qb.qb_warn = qp->qu_warn;
+ }
+ if (fdq->d_fieldmask & FS_DQ_BHARD) {
+ qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+ qd->qd_qb.qb_limit = qp->qu_limit;
+ }
+ }
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
- err = 0;
- qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
- qd->qd_qb.qb_value = cpu_to_be64(value);
- ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC);
- ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value);
+
+ err = gfs2_meta_inode_buffer(ip, &dibh);
+ if (err)
+ goto unlock;
+
+ size = loc + sizeof(struct gfs2_quota);
+ if (size > inode->i_size) {
+ ip->i_disksize = size;
+ i_size_write(inode, size);
+ }
+ inode->i_mtime = inode->i_atime = CURRENT_TIME;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ mark_inode_dirty(inode);
+
unlock:
unlock_page(page);
page_cache_release(page);
@@ -739,9 +742,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
return -ENOMEM;
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+ mutex_lock_nested(&ip->i_inode.i_mutex, I_MUTEX_QUOTA);
for (qx = 0; qx < num_qd; qx++) {
- error = gfs2_glock_nq_init(qda[qx]->qd_gl,
- LM_ST_EXCLUSIVE,
+ error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
if (error)
goto out;
@@ -795,9 +798,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
for (x = 0; x < num_qd; x++) {
qd = qda[x];
offset = qd2offset(qd);
- error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
- (struct gfs2_quota_data *)
- qd);
+ error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
if (error)
goto out_end_trans;
@@ -817,21 +818,44 @@ out_gunlock:
out:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
+ mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
return error;
}
+static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_quota q;
+ struct gfs2_quota_lvb *qlvb;
+ loff_t pos;
+ int error;
+
+ memset(&q, 0, sizeof(struct gfs2_quota));
+ pos = qd2offset(qd);
+ error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q));
+ if (error < 0)
+ return error;
+
+ qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+ qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+ qlvb->__pad = 0;
+ qlvb->qb_limit = q.qu_limit;
+ qlvb->qb_warn = q.qu_warn;
+ qlvb->qb_value = q.qu_value;
+ qd->qd_qb = *qlvb;
+
+ return 0;
+}
+
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
- struct gfs2_quota_host q;
- char buf[sizeof(struct gfs2_quota)];
int error;
- struct gfs2_quota_lvb *qlvb;
restart:
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
@@ -841,11 +865,9 @@ restart:
qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
- loff_t pos;
gfs2_glock_dq_uninit(q_gh);
- error = gfs2_glock_nq_init(qd->qd_gl,
- LM_ST_EXCLUSIVE, GL_NOCACHE,
- q_gh);
+ error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, q_gh);
if (error)
return error;
@@ -853,29 +875,14 @@ restart:
if (error)
goto fail;
- memset(buf, 0, sizeof(struct gfs2_quota));
- pos = qd2offset(qd);
- error = gfs2_internal_read(ip, NULL, buf, &pos,
- sizeof(struct gfs2_quota));
- if (error < 0)
+ error = update_qd(sdp, qd);
+ if (error)
goto fail_gunlock;
gfs2_glock_dq_uninit(&i_gh);
-
- gfs2_quota_in(&q, buf);
- qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
- qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
- qlvb->__pad = 0;
- qlvb->qb_limit = cpu_to_be64(q.qu_limit);
- qlvb->qb_warn = cpu_to_be64(q.qu_warn);
- qlvb->qb_value = cpu_to_be64(q.qu_value);
- qd->qd_qb = *qlvb;
-
- if (gfs2_glock_is_blocking(qd->qd_gl)) {
- gfs2_glock_dq_uninit(q_gh);
- force_refresh = 0;
- goto restart;
- }
+ gfs2_glock_dq_uninit(q_gh);
+ force_refresh = 0;
+ goto restart;
}
return 0;
@@ -995,7 +1002,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
- printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+ printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
sdp->sd_fsname, type,
(test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
qd->qd_id);
@@ -1032,6 +1039,10 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
print_message(qd, "exceeded");
+ quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id,
+ sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
+
error = -EDQUOT;
break;
} else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
@@ -1039,6 +1050,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
time_after_eq(jiffies, qd->qd_last_warn +
gfs2_tune_get(sdp,
gt_quota_warn_period) * HZ)) {
+ quota_send_warning(test_bit(QDF_USER, &qd->qd_flags) ?
+ USRQUOTA : GRPQUOTA, qd->qd_id,
+ sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
error = print_message(qd, "warning");
qd->qd_last_warn = jiffies;
}
@@ -1069,8 +1083,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
}
}
-int gfs2_quota_sync(struct gfs2_sbd *sdp)
+int gfs2_quota_sync(struct super_block *sb, int type)
{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
unsigned int num_qd;
@@ -1118,7 +1133,7 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
struct gfs2_holder q_gh;
int error;
- error = qd_get(sdp, user, id, CREATE, &qd);
+ error = qd_get(sdp, user, id, &qd);
if (error)
return error;
@@ -1127,7 +1142,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
gfs2_glock_dq_uninit(&q_gh);
qd_put(qd);
-
return error;
}
@@ -1298,12 +1312,12 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
}
static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
- int (*fxn)(struct gfs2_sbd *sdp),
+ int (*fxn)(struct super_block *sb, int type),
unsigned long t, unsigned long *timeo,
unsigned int *new_timeo)
{
if (t >= *timeo) {
- int error = fxn(sdp);
+ int error = fxn(sdp->sd_vfs, 0);
quotad_error(sdp, msg, error);
*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
} else {
@@ -1330,6 +1344,14 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
}
}
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
+ if (!sdp->sd_statfs_force_sync) {
+ sdp->sd_statfs_force_sync = 1;
+ wake_up(&sdp->sd_quota_wait);
+ }
+}
+
+
/**
* gfs2_quotad - Write cached quota changes into the quota file
* @sdp: Pointer to GFS2 superblock
@@ -1349,8 +1371,15 @@ int gfs2_quotad(void *data)
while (!kthread_should_stop()) {
/* Update the master statfs file */
- quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
- &statfs_timeo, &tune->gt_statfs_quantum);
+ if (sdp->sd_statfs_force_sync) {
+ int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+ quotad_error(sdp, "statfs", error);
+ statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+ }
+ else
+ quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+ &statfs_timeo,
+ &tune->gt_statfs_quantum);
/* Update quota file */
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
@@ -1367,7 +1396,7 @@ int gfs2_quotad(void *data)
spin_lock(&sdp->sd_trunc_lock);
empty = list_empty(&sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
- if (empty)
+ if (empty && !sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
@@ -1377,3 +1406,181 @@ int gfs2_quotad(void *data)
return 0;
}
+static int gfs2_quota_get_xstate(struct super_block *sb,
+ struct fs_quota_stat *fqs)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ memset(fqs, 0, sizeof(struct fs_quota_stat));
+ fqs->qs_version = FS_QSTAT_VERSION;
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
+ fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+ else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
+ fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+ if (sdp->sd_quota_inode) {
+ fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+ fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+ }
+ fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
+ fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
+ fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+ return 0;
+}
+
+static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_quota_lvb *qlvb;
+ struct gfs2_quota_data *qd;
+ struct gfs2_holder q_gh;
+ int error;
+
+ memset(fdq, 0, sizeof(struct fs_disk_quota));
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return -ESRCH; /* Crazy XFS error code */
+
+ if (type == USRQUOTA)
+ type = QUOTA_USER;
+ else if (type == GRPQUOTA)
+ type = QUOTA_GROUP;
+ else
+ return -EINVAL;
+
+ error = qd_get(sdp, type, id, &qd);
+ if (error)
+ return error;
+ error = do_glock(qd, FORCE, &q_gh);
+ if (error)
+ goto out;
+
+ qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+ fdq->d_version = FS_DQUOT_VERSION;
+ fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+ fdq->d_id = id;
+ fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
+ fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
+ fdq->d_bcount = be64_to_cpu(qlvb->qb_value);
+
+ gfs2_glock_dq_uninit(&q_gh);
+out:
+ qd_put(qd);
+ return error;
+}
+
+/* GFS2 only supports a subset of the XFS fields */
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+
+static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_quota_data *qd;
+ struct gfs2_holder q_gh, i_gh;
+ unsigned int data_blocks, ind_blocks;
+ unsigned int blocks = 0;
+ int alloc_required;
+ struct gfs2_alloc *al;
+ loff_t offset;
+ int error;
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return -ESRCH; /* Crazy XFS error code */
+
+ switch(type) {
+ case USRQUOTA:
+ type = QUOTA_USER;
+ if (fdq->d_flags != XFS_USER_QUOTA)
+ return -EINVAL;
+ break;
+ case GRPQUOTA:
+ type = QUOTA_GROUP;
+ if (fdq->d_flags != XFS_GROUP_QUOTA)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
+ return -EINVAL;
+ if (fdq->d_id != id)
+ return -EINVAL;
+
+ error = qd_get(sdp, type, id, &qd);
+ if (error)
+ return error;
+
+ mutex_lock(&ip->i_inode.i_mutex);
+ error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
+ if (error)
+ goto out_put;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ goto out_q;
+
+ /* Check for existing entry, if none then alloc new blocks */
+ error = update_qd(sdp, qd);
+ if (error)
+ goto out_i;
+
+ /* If nothing has changed, this is a no-op */
+ if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+ (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn)))
+ fdq->d_fieldmask ^= FS_DQ_BSOFT;
+ if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+ (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit)))
+ fdq->d_fieldmask ^= FS_DQ_BHARD;
+ if (fdq->d_fieldmask == 0)
+ goto out_i;
+
+ offset = qd2offset(qd);
+ error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+ &alloc_required);
+ if (error)
+ goto out_i;
+ if (alloc_required) {
+ al = gfs2_alloc_get(ip);
+ if (al == NULL)
+ goto out_i;
+ gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+ &data_blocks, &ind_blocks);
+ blocks = al->al_requested = 1 + data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_alloc;
+ }
+
+ error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+ if (error)
+ goto out_release;
+
+ /* Apply changes */
+ error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+
+ gfs2_trans_end(sdp);
+out_release:
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ }
+out_i:
+ gfs2_glock_dq_uninit(&i_gh);
+out_q:
+ gfs2_glock_dq_uninit(&q_gh);
+out_put:
+ mutex_unlock(&ip->i_inode.i_mutex);
+ qd_put(qd);
+ return error;
+}
+
+const struct quotactl_ops gfs2_quotactl_ops = {
+ .quota_sync = gfs2_quota_sync,
+ .get_xstate = gfs2_quota_get_xstate,
+ .get_xquota = gfs2_xquota_get,
+ .set_xquota = gfs2_xquota_set,
+};
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0fa5fa63d0e..e271fa07ad0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,13 +25,15 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid);
-extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct super_block *sb, int type);
extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
extern int gfs2_quota_init(struct gfs2_sbd *sdp);
extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
extern int gfs2_quotad(void *data);
+extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -50,5 +52,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
}
extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d..4b9bece3d43 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -7,6 +7,7 @@
* of the GNU General Public License version 2.
*/
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -409,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
memset(lh, 0, sizeof(struct gfs2_log_header));
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.__pad0 = cpu_to_be64(0);
lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
lh->lh_blkno = cpu_to_be32(lblock);
@@ -593,6 +596,7 @@ fail:
}
struct slow_work_ops gfs2_recover_ops = {
+ .owner = THIS_MODULE,
.get_ref = gfs2_recover_get_ref,
.put_ref = gfs2_recover_put_ref,
.execute = gfs2_recover_work,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8f1cfb02a6c..0608f490c29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1710,11 +1710,16 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
{
struct gfs2_rgrpd *rgd;
struct gfs2_holder ri_gh, rgd_gh;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+ int ri_locked = 0;
int error;
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto fail;
+ if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto fail;
+ ri_locked = 1;
+ }
error = -EINVAL;
rgd = gfs2_blk2rgrpd(sdp, no_addr);
@@ -1730,7 +1735,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
gfs2_glock_dq_uninit(&rgd_gh);
fail_rindex:
- gfs2_glock_dq_uninit(&ri_gh);
+ if (ri_locked)
+ gfs2_glock_dq_uninit(&ri_gh);
fail:
return error;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0ec3ec672de..c282ad41f3d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -70,6 +70,11 @@ enum {
Opt_commit,
Opt_err_withdraw,
Opt_err_panic,
+ Opt_statfs_quantum,
+ Opt_statfs_percent,
+ Opt_quota_quantum,
+ Opt_barrier,
+ Opt_nobarrier,
Opt_error,
};
@@ -101,18 +106,23 @@ static const match_table_t tokens = {
{Opt_commit, "commit=%d"},
{Opt_err_withdraw, "errors=withdraw"},
{Opt_err_panic, "errors=panic"},
+ {Opt_statfs_quantum, "statfs_quantum=%d"},
+ {Opt_statfs_percent, "statfs_percent=%d"},
+ {Opt_quota_quantum, "quota_quantum=%d"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_error, NULL}
};
/**
* gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
+ * @args: The structure into which the parsed options will be written
+ * @options: The options to parse
*
* Return: errno
*/
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+int gfs2_mount_args(struct gfs2_args *args, char *options)
{
char *o;
int token;
@@ -157,7 +167,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
break;
case Opt_debug:
if (args->ar_errors == GFS2_ERRORS_PANIC) {
- fs_info(sdp, "-o debug and -o errors=panic "
+ printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
"are mutually exclusive.\n");
return -EINVAL;
}
@@ -210,7 +220,29 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
case Opt_commit:
rv = match_int(&tmp[0], &args->ar_commit);
if (rv || args->ar_commit <= 0) {
- fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+ printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_statfs_quantum:
+ rv = match_int(&tmp[0], &args->ar_statfs_quantum);
+ if (rv || args->ar_statfs_quantum < 0) {
+ printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_quota_quantum:
+ rv = match_int(&tmp[0], &args->ar_quota_quantum);
+ if (rv || args->ar_quota_quantum <= 0) {
+ printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+ return rv ? rv : -EINVAL;
+ }
+ break;
+ case Opt_statfs_percent:
+ rv = match_int(&tmp[0], &args->ar_statfs_percent);
+ if (rv || args->ar_statfs_percent < 0 ||
+ args->ar_statfs_percent > 100) {
+ printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
return rv ? rv : -EINVAL;
}
break;
@@ -219,15 +251,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
break;
case Opt_err_panic:
if (args->ar_debug) {
- fs_info(sdp, "-o debug and -o errors=panic "
+ printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
"are mutually exclusive.\n");
return -EINVAL;
}
args->ar_errors = GFS2_ERRORS_PANIC;
break;
+ case Opt_barrier:
+ args->ar_nobarrier = 0;
+ break;
+ case Opt_nobarrier:
+ args->ar_nobarrier = 1;
+ break;
case Opt_error:
default:
- fs_info(sdp, "invalid mount option: %s\n", o);
+ printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
return -EINVAL;
}
}
@@ -442,7 +480,10 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
{
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+ struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
struct buffer_head *l_bh;
+ s64 x, y;
+ int need_sync = 0;
int error;
error = gfs2_meta_inode_buffer(l_ip, &l_bh);
@@ -456,9 +497,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
l_sc->sc_free += free;
l_sc->sc_dinodes += dinodes;
gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+ if (sdp->sd_args.ar_statfs_percent) {
+ x = 100 * l_sc->sc_free;
+ y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
+ if (x >= y || x <= -y)
+ need_sync = 1;
+ }
spin_unlock(&sdp->sd_statfs_spin);
brelse(l_bh);
+ if (need_sync)
+ gfs2_wake_up_statfs(sdp);
}
void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
@@ -484,8 +533,9 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
}
-int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+int gfs2_statfs_sync(struct super_block *sb, int type)
{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
@@ -521,6 +571,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
goto out_bh2;
update_statfs(sdp, m_bh, l_bh);
+ sdp->sd_statfs_force_sync = 0;
gfs2_trans_end(sdp);
@@ -712,8 +763,8 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
int error;
flush_workqueue(gfs2_delete_workqueue);
- gfs2_quota_sync(sdp);
- gfs2_statfs_sync(sdp);
+ gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
&t_gh);
@@ -1061,8 +1112,13 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
spin_lock(&gt->gt_spin);
args.ar_commit = gt->gt_log_flush_secs;
+ args.ar_quota_quantum = gt->gt_quota_quantum;
+ if (gt->gt_statfs_slow)
+ args.ar_statfs_quantum = 0;
+ else
+ args.ar_statfs_quantum = gt->gt_statfs_quantum;
spin_unlock(&gt->gt_spin);
- error = gfs2_mount_args(sdp, &args, data);
+ error = gfs2_mount_args(&args, data);
if (error)
return error;
@@ -1097,8 +1153,21 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
sb->s_flags |= MS_POSIXACL;
else
sb->s_flags &= ~MS_POSIXACL;
+ if (sdp->sd_args.ar_nobarrier)
+ set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+ else
+ clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
spin_lock(&gt->gt_spin);
gt->gt_log_flush_secs = args.ar_commit;
+ gt->gt_quota_quantum = args.ar_quota_quantum;
+ if (args.ar_statfs_quantum) {
+ gt->gt_statfs_slow = 0;
+ gt->gt_statfs_quantum = args.ar_statfs_quantum;
+ }
+ else {
+ gt->gt_statfs_slow = 1;
+ gt->gt_statfs_quantum = 30;
+ }
spin_unlock(&gt->gt_spin);
gfs2_online_uevent(sdp);
@@ -1179,7 +1248,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
struct gfs2_args *args = &sdp->sd_args;
- int lfsecs;
+ int val;
if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
seq_printf(s, ",meta");
@@ -1240,9 +1309,17 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
}
if (args->ar_discard)
seq_printf(s, ",discard");
- lfsecs = sdp->sd_tune.gt_log_flush_secs;
- if (lfsecs != 60)
- seq_printf(s, ",commit=%d", lfsecs);
+ val = sdp->sd_tune.gt_log_flush_secs;
+ if (val != 60)
+ seq_printf(s, ",commit=%d", val);
+ val = sdp->sd_tune.gt_statfs_quantum;
+ if (val != 30)
+ seq_printf(s, ",statfs_quantum=%d", val);
+ val = sdp->sd_tune.gt_quota_quantum;
+ if (val != 60)
+ seq_printf(s, ",quota_quantum=%d", val);
+ if (args->ar_statfs_percent)
+ seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
const char *state;
@@ -1259,6 +1336,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
}
seq_printf(s, ",errors=%s", state);
}
+ if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+ seq_printf(s, ",nobarrier");
+
return 0;
}
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 235db368288..3df60f2d84e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -27,7 +27,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
+extern int gfs2_mount_args(struct gfs2_args *args, char *data);
extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
@@ -44,7 +44,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
const void *buf);
extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct buffer_head *l_bh);
-extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_sync(struct super_block *sb, int type);
extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 446329728d5..c5dad1eb7b9 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -158,7 +158,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_statfs_sync(sdp);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
return len;
}
@@ -171,13 +171,14 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_quota_sync(sdp);
+ gfs2_quota_sync(sdp->sd_vfs, 0);
return len;
}
static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
@@ -185,13 +186,14 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- gfs2_quota_refresh(sdp, 1, id);
- return len;
+ error = gfs2_quota_refresh(sdp, 1, id);
+ return error ? error : len;
}
static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error;
u32 id;
if (!capable(CAP_SYS_ADMIN))
@@ -199,8 +201,8 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
id = simple_strtoul(buf, NULL, 0);
- gfs2_quota_refresh(sdp, 0, id);
- return len;
+ error = gfs2_quota_refresh(sdp, 0, id);
+ return error ? error : len;
}
static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a0f8ef6ee2..912f5cbc474 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -186,8 +186,8 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
return 0;
}
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
- struct gfs2_ea_location *el)
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+ struct gfs2_ea_location *el)
{
struct ea_find ef;
int error;
@@ -516,8 +516,8 @@ out:
return error;
}
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- char *data, size_t size)
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ char *data, size_t size)
{
int ret;
size_t len = GFS2_EA_DATA_LEN(el->el_ea);
@@ -534,6 +534,36 @@ int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
return len;
}
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+ struct gfs2_ea_location el;
+ int error;
+ int len;
+ char *data;
+
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+ if (error)
+ return error;
+ if (!el.el_ea)
+ goto out;
+ if (!GFS2_EA_DATA_LEN(el.el_ea))
+ goto out;
+
+ len = GFS2_EA_DATA_LEN(el.el_ea);
+ data = kmalloc(len, GFP_NOFS);
+ error = -ENOMEM;
+ if (data == NULL)
+ goto out;
+
+ error = gfs2_ea_get_copy(ip, &el, data, len);
+ if (error == 0)
+ error = len;
+ *ppdata = data;
+out:
+ brelse(el.el_bh);
+ return error;
+}
+
/**
* gfs2_xattr_get - Get a GFS2 extended attribute
* @inode: The inode
@@ -1259,22 +1289,26 @@ fail:
return error;
}
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data)
+int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
+ struct gfs2_ea_location el;
struct buffer_head *dibh;
int error;
- if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
+ if (error)
+ return error;
+
+ if (GFS2_EA_IS_STUFFED(el.el_ea)) {
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
if (error)
return error;
- gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
- memcpy(GFS2_EA2DATA(el->el_ea), data,
- GFS2_EA_DATA_LEN(el->el_ea));
+ gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
+ memcpy(GFS2_EA2DATA(el.el_ea), data,
+ GFS2_EA_DATA_LEN(el.el_ea));
} else
- error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+ error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
if (error)
return error;
@@ -1507,18 +1541,6 @@ static int gfs2_xattr_user_set(struct inode *inode, const char *name,
return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
}
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
-
static int gfs2_xattr_security_get(struct inode *inode, const char *name,
void *buffer, size_t size)
{
@@ -1543,12 +1565,6 @@ static struct xattr_handler gfs2_xattr_security_handler = {
.set = gfs2_xattr_security_set,
};
-static struct xattr_handler gfs2_xattr_system_handler = {
- .prefix = XATTR_SYSTEM_PREFIX,
- .get = gfs2_xattr_system_get,
- .set = gfs2_xattr_system_set,
-};
-
struct xattr_handler *gfs2_xattr_handlers[] = {
&gfs2_xattr_user_handler,
&gfs2_xattr_security_handler,
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index cbdfd774373..8d6ae5813c4 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,11 +62,7 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
- struct gfs2_ea_location *el);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data);
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9b9d6395bad..052f214ea6f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
}
unlock_new_inode(tree->inode);
+ if (!HFS_I(tree->inode)->first_blocks) {
+ printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
+ goto free_inode;
+ }
+
mapping = tree->inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 175d08eacc8..bed78ac8f6d 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (hfsplus_get_last_session(sb, &part_start, &part_size))
return -EINVAL;
+ if ((u64)part_start + part_size > 0x100000000ULL) {
+ pr_err("hfs: volumes larger than 2TB are not supported yet\n");
+ return -EINVAL;
+ }
while (1) {
bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
if (!bh)
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be5597..06c1f02de61 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,7 +18,6 @@
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
-#include <linux/ima.h>
#include <linux/pagemap.h>
#include <linux/cdev.h>
#include <linux/bootmem.h>
@@ -157,11 +156,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (security_inode_alloc(inode))
goto out;
-
- /* allocate and initialize an i_integrity */
- if (ima_inode_alloc(inode))
- goto out_free_security;
-
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -201,9 +195,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
#endif
return 0;
-
-out_free_security:
- security_inode_free(inode);
out:
return -ENOMEM;
}
@@ -235,7 +226,6 @@ static struct inode *alloc_inode(struct super_block *sb)
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
- ima_inode_free(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
#ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7b17a14396f..6c751106c2e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode,
u64 len, get_block_t *get_block)
{
struct buffer_head tmp;
- unsigned int start_blk;
+ unsigned long long start_blk;
long long length = 0, map_len = 0;
u64 logical = 0, phys = 0, size = 0;
u32 flags = FIEMAP_EXTENT_MERGED;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd3c073b485..4160afad6d0 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
+EXPORT_SYMBOL(log_start_commit);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
@@ -756,6 +757,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
return journal;
out_err:
+ kfree(journal->j_wbuf);
kfree(journal);
return NULL;
}
@@ -820,6 +822,7 @@ journal_t * journal_init_inode (struct inode *inode)
return journal;
out_err:
+ kfree(journal->j_wbuf);
kfree(journal);
return NULL;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b0ab5219bec..fed85388ee8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -913,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
return journal;
out_err:
+ kfree(journal->j_wbuf);
jbd2_stats_proc_exit(journal);
kfree(journal);
return NULL;
@@ -986,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
return journal;
out_err:
+ kfree(journal->j_wbuf);
jbd2_stats_proc_exit(journal);
kfree(journal);
return NULL;
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index cfe05c1966a..3f39be1b045 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
/* XXX FIXME: Where a single physical node actually shows up in two
frags, we read it twice. Don't do that. */
- /* Now we're pointing at the first frag which overlaps our page */
+ /* Now we're pointing at the first frag which overlaps our page
+ * (or perhaps is before it, if we've been asked to read off the
+ * end of the file). */
while(offset < end) {
D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end));
- if (unlikely(!frag || frag->ofs > offset)) {
+ if (unlikely(!frag || frag->ofs > offset ||
+ frag->ofs + frag->size <= offset)) {
uint32_t holesize = end - offset;
- if (frag) {
+ if (frag && frag->ofs > offset) {
D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset));
holesize = min(holesize, frag->ofs - offset);
}
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4fd22..7d70d63ceb2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,16 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
+ /* ... and get the mountpoint */
+ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+ if (retval)
+ return retval;
+
+ retval = security_sb_mount(dev_name, &path,
+ type_page, flags, data_page);
+ if (retval)
+ goto dput_out;
+
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
@@ -1945,16 +1955,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
MS_STRICTATIME);
- /* ... and get the mountpoint */
- retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
- if (retval)
- return retval;
-
- retval = security_sb_mount(dev_name, &path,
- type_page, flags, data_page);
- if (retval)
- goto dput_out;
-
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 63976c0ccc2..99ea196f071 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1, flags & NFS_MOUNT_NORESVPORT);
if (error < 0)
goto error;
- memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
error = nfs_idmap_new(clp);
if (error < 0) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32062c33c85..7cb298525ee 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
+ nfs_inode_return_delegation(inode);
+
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6c3210099d5..e1d415e9784 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
};
struct rpc_task_setup task_setup_data = {
.rpc_client = NFS_CLIENT(inode),
+ .rpc_message = &msg,
.callback_ops = &nfs_write_direct_ops,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 70fad69eb95..fa588006588 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
BUG_ON(!cookie);
- if (fscache_check_page_write(cookie, page)) {
- if (!(gfp & __GFP_WAIT))
- return 0;
- fscache_wait_on_page_write(cookie, page);
- }
-
if (PageFsCache(page)) {
dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
cookie, page, nfsi);
- fscache_uncache_page(cookie, page);
+ if (!fscache_maybe_release_page(cookie, page, gfp))
+ return 0;
+
nfs_add_fscache_stats(page->mapping->host,
NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2636c26d56f..fa3408f2011 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
if (IS_ERR(mnt_path))
- return mnt;
+ return ERR_CAST(mnt_path);
mountdata->mnt_path = mnt_path;
maxbuflen = mnt_path - 1 - page2;
@@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
if (buf->len <= 0 || buf->len >= maxbuflen)
continue;
- mountdata->addr = (struct sockaddr *)&addr;
-
if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
continue;
- mountdata->addrlen = nfs_parse_server_name(buf->data,
- buf->len,
- mountdata->addr, mountdata->addrlen);
+
+ mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
+ (struct sockaddr *)&addr, sizeof(addr));
if (mountdata->addrlen == 0)
continue;
+
+ mountdata->addr = (struct sockaddr *)&addr;
rpc_set_port(mountdata->addr, NFS_PORT);
memcpy(page2, buf->data, buf->len);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed7c269e251..741a562177f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
{
- if (err < -1000) {
+ if (err >= -1000)
+ return err;
+ switch (err) {
+ case -NFS4ERR_RESOURCE:
+ return -EREMOTEIO;
+ default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
- return -EIO;
+ break;
}
- return err;
+ return -EIO;
}
/*
@@ -2762,7 +2767,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.pages = &page,
.pgbase = 0,
.count = count,
- .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
+ .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
};
struct nfs4_readdir_res res;
struct rpc_message msg = {
@@ -3060,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
if (time_before(clp->cl_last_renewal,timestamp))
clp->cl_last_renewal = timestamp;
spin_unlock(&clp->cl_lock);
- dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
- task->tk_msg.rpc_cred);
- put_rpccred(task->tk_msg.rpc_cred);
}
static const struct rpc_call_ops nfs4_renew_ops = {
@@ -4877,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
- put_rpccred(task->tk_msg.rpc_cred);
kfree(task->tk_msg.rpc_argp);
kfree(task->tk_msg.rpc_resp);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6cef18f..0156c01c212 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
}
void
-nfs4_renewd_prepare_shutdown(struct nfs_server *server)
-{
- cancel_delayed_work(&server->nfs_client->cl_renewd);
-}
-
-void
nfs4_kill_renewd(struct nfs_client *clp)
{
cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 83ad47cbdd8..20b4e30e6c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5681,7 +5681,6 @@ static struct {
{ NFS4ERR_SERVERFAULT, -ESERVERFAULT },
{ NFS4ERR_BADTYPE, -EBADTYPE },
{ NFS4ERR_LOCKED, -EAGAIN },
- { NFS4ERR_RESOURCE, -EREMOTEIO },
{ NFS4ERR_SYMLINK, -ELOOP },
{ NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
{ NFS4ERR_DEADLOCK, -EDEADLK },
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29786d3b932..90be551b80c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb)
unlock_kernel();
}
-static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
{
struct nfs_parsed_mount_data *data;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data) {
- data->flags = flags;
data->rsize = NFS_MAX_FILE_IO_SIZE;
data->wsize = NFS_MAX_FILE_IO_SIZE;
data->acregmin = NFS_DEF_ACREGMIN;
data->acregmax = NFS_DEF_ACREGMAX;
data->acdirmin = NFS_DEF_ACDIRMIN;
data->acdirmax = NFS_DEF_ACDIRMAX;
+ data->mount_server.port = NFS_UNSPEC_PORT;
data->nfs_server.port = NFS_UNSPEC_PORT;
+ data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
data->auth_flavors[0] = RPC_AUTH_UNIX;
data->auth_flavor_len = 1;
+ data->version = version;
data->minorversion = 0;
}
return data;
@@ -776,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr)
* Select between a default port value and a user-specified port value.
* If a zero value is set, then autobind will be used.
*/
-static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+static void nfs_set_port(struct sockaddr *sap, int *port,
const unsigned short default_port)
{
- unsigned short port = default_port;
+ if (*port == NFS_UNSPEC_PORT)
+ *port = default_port;
- if (parsed_port != NFS_UNSPEC_PORT)
- port = parsed_port;
-
- rpc_set_port(sap, port);
+ rpc_set_port(sap, *port);
}
/*
@@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw,
default:
dfprintk(MOUNT, "NFS: unrecognized "
"transport protocol\n");
+ kfree(string);
return 0;
}
break;
@@ -1475,7 +1476,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
args->mount_server.addrlen = args->nfs_server.addrlen;
}
request.salen = args->mount_server.addrlen;
- nfs_set_default_port(request.sap, args->mount_server.port, 0);
+ nfs_set_port(request.sap, &args->mount_server.port, 0);
/*
* Now ask the mount server to map our export path
@@ -1711,8 +1712,6 @@ static int nfs_validate_mount_data(void *options,
if (!(data->flags & NFS_MOUNT_TCP))
args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- else
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
/* N.B. caller will free nfs_server.hostname in all cases */
args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
args->namlen = data->namlen;
@@ -1767,7 +1766,7 @@ static int nfs_validate_mount_data(void *options,
goto out_v4_not_compiled;
#endif
- nfs_set_default_port(sap, args->nfs_server.port, 0);
+ nfs_set_port(sap, &args->nfs_server.port, 0);
nfs_set_mount_transport_protocol(args);
@@ -1848,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
data->acdirmin != nfss->acdirmin / HZ ||
data->acdirmax != nfss->acdirmax / HZ ||
data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.port != nfss->port ||
data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
- memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
- data->nfs_server.addrlen) != 0)
+ !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
+ (struct sockaddr *)&nfss->nfs_client->cl_addr))
return -EINVAL;
return 0;
@@ -1893,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->acdirmin = nfss->acdirmin / HZ;
data->acdirmax = nfss->acdirmax / HZ;
data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.port = nfss->port;
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
@@ -2106,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
};
int error = -ENOMEM;
- data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
+ data = nfs_alloc_parsed_mount_data(3);
mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
if (data == NULL || mntfh == NULL)
goto out_free_fh;
@@ -2331,7 +2332,7 @@ static int nfs4_validate_text_mount_data(void *options,
{
struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
- nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+ nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
nfs_validate_transport_protocol(args);
@@ -2376,7 +2377,6 @@ static int nfs4_validate_mount_data(void *options,
if (data == NULL)
goto out_no_data;
- args->version = 4;
switch (data->version) {
case 1:
if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2660,7 +2660,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
struct nfs_parsed_mount_data *data;
int error = -ENOMEM;
- data = nfs_alloc_parsed_mount_data(0);
+ data = nfs_alloc_parsed_mount_data(4);
if (data == NULL)
goto out_free_data;
@@ -2690,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb)
dprintk("--> %s\n", __func__);
nfs_super_return_all_delegations(sb);
kill_anon_super(sb);
- nfs4_renewd_prepare_shutdown(server);
nfs_fscache_release_super_cookie(sb);
nfs_free_server(server);
dprintk("<-- %s\n", __func__);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index edf926e1062..d0a2ce1b432 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
if (plus)
- p = encode_entryplus_baggage(cd, p1, name, namlen);
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5941958f1e4..84c25382f8e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -87,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
brelse(bh);
BUG();
}
+ memset(bh->b_data, 0, 1 << inode->i_blkbits);
bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
bh->b_blocknr = blocknr;
set_buffer_mapped(bh);
@@ -276,8 +277,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
- if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
- BUG();
+ nilfs_btnode_mark_dirty(obh);
spin_lock_irq(&btnc->tree_lock);
radix_tree_delete(&btnc->page_tree, oldkey);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 1c6cfb59128..3f5d5d06f53 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
* exclusive with a new mount job. Though it doesn't cover
* umount, it's enough for the purpose.
*/
- mutex_lock(&nilfs->ns_mount_mutex);
if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
/* Current implementation does not have to protect
plain read-only mounts since they are exclusive
@@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
ret = -EBUSY;
} else
ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
- mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
case NILFS_SNAPSHOT:
return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5040220c373..2a0a5a3ac13 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -664,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
int err;
spin_lock(&sbi->s_inode_lock);
- /* Caller of this function MUST lock s_inode_lock */
if (ii->i_bh == NULL) {
spin_unlock(&sbi->s_inode_lock);
err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6572ea4bc4d..f6af76042d8 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
- struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
+ struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+ struct inode *cpfile = nilfs->ns_cpfile;
struct nilfs_transaction_info ti;
struct nilfs_cpmode cpmode;
int ret;
@@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
return -EFAULT;
+ mutex_lock(&nilfs->ns_mount_mutex);
nilfs_transaction_begin(inode->i_sb, &ti, 0);
ret = nilfs_cpfile_change_cpmode(
cpfile, cpmode.cm_cno, cpmode.cm_mode);
if (unlikely(ret < 0)) {
nilfs_transaction_abort(inode->i_sb);
+ mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
}
nilfs_transaction_commit(inode->i_sb); /* never fails */
+ mutex_unlock(&nilfs->ns_mount_mutex);
return ret;
}
@@ -297,7 +301,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
(unsigned long long)vdesc->vd_vblocknr);
return ret;
}
- bh->b_private = vdesc;
+ if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
+ printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
+ "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
+ brelse(bh);
+ return -EEXIST;
+ }
list_add_tail(&bh->b_assoc_buffers, buffers);
return 0;
}
@@ -335,24 +350,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
ret = nilfs_gccache_wait_and_mark_dirty(bh);
if (unlikely(ret < 0)) {
- if (ret == -EEXIST) {
- vdesc = bh->b_private;
- printk(KERN_CRIT
- "%s: conflicting %s buffer: "
- "ino=%llu, cno=%llu, offset=%llu, "
- "blocknr=%llu, vblocknr=%llu\n",
- __func__,
- vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
- }
+ WARN_ON(ret == -EEXIST);
goto failed;
}
list_del_init(&bh->b_assoc_buffers);
- bh->b_private = NULL;
brelse(bh);
}
return nmembs;
@@ -360,7 +361,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
failed:
list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
list_del_init(&bh->b_assoc_buffers);
- bh->b_private = NULL;
brelse(bh);
}
return ret;
@@ -471,7 +471,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- nilfs_remove_all_gcinode(nilfs);
printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
msg, ret);
return ret;
@@ -560,6 +559,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
else
ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+ if (ret < 0)
+ nilfs_remove_all_gcinode(nilfs);
clear_nilfs_gc_running(nilfs);
out_free:
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 683df89dbae..6eff66a070d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
/* Clear requests (even when the construction failed) */
spin_lock(&sci->sc_state_lock);
- sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
-
if (req->mode == SC_LSEG_SR) {
+ sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
sci->sc_seq_done = req->seq_accepted;
nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
sci->sc_flush_request = 0;
- } else if (req->mode == SC_FLUSH_FILE)
- sci->sc_flush_request &= ~FLUSH_FILE_BIT;
- else if (req->mode == SC_FLUSH_DAT)
- sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+ } else {
+ if (req->mode == SC_FLUSH_FILE)
+ sci->sc_flush_request &= ~FLUSH_FILE_BIT;
+ else if (req->mode == SC_FLUSH_DAT)
+ sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+ /* re-enable timer if checkpoint creation was not done */
+ if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+ time_before(jiffies, sci->sc_timer->expires))
+ add_timer(sci->sc_timer);
+ }
spin_unlock(&sci->sc_state_lock);
}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 828a889be90..7e54e52964d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct fown_struct *fown;
+ __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
to_tell = event->to_tell;
@@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
spin_lock(&entry->lock);
prev = &dnentry->dn;
while ((dn = *prev) != NULL) {
- if ((dn->dn_mask & event->mask) == 0) {
+ if ((dn->dn_mask & test_mask) == 0) {
prev = &dn->dn_next;
continue;
}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index c8a07c65482..3165d85aada 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
spin_lock(&group->mark_lock);
spin_lock(&inode->i_lock);
- entry->group = group;
- entry->inode = inode;
-
lentry = fsnotify_find_mark_entry(group, inode);
if (!lentry) {
+ entry->group = group;
+ entry->inode = inode;
+
hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
list_add(&entry->g_list, &group->mark_entries);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3816d5750dd..b8bf53b4c10 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
/* remember, after old was put on the wait_q we aren't
* allowed to look at the inode any more, only thing
* left to check was if the file_name is the same */
- if (old->name_len &&
+ if (!old->name_len ||
!strcmp(old->file_name, new->file_name))
return true;
break;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89fc8ee1f5a..de059f49058 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1712,7 +1712,8 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
struct super_block *sb = inode->i_sb;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
- !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index eae40460242..d963d863870 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -35,12 +35,7 @@
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 60287fc56bc..3a0df7a1b81 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3743,6 +3743,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
goto out;
}
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto attach_xattr;
+
ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
size = i_size_read(inode);
@@ -3769,6 +3772,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
cpos += num_clusters;
}
+attach_xattr:
if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
&ref_tree->rf_ci,
@@ -3858,6 +3862,49 @@ out:
return ret;
}
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+ BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+ memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+ le16_to_cpu(s_di->id2.i_data.id_count));
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+ t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
static int ocfs2_duplicate_extent_list(struct inode *s_inode,
struct inode *t_inode,
struct buffer_head *t_bh,
@@ -3997,6 +4044,14 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
goto out;
}
+ if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+ t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
if (ret) {
@@ -4013,10 +4068,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
goto out_unlock_refcount;
}
- ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
- if (ret)
- mlog_errno(ret);
-
out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
@@ -4068,9 +4119,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
ret = ocfs2_reflink_xattrs(inode, old_bh,
new_inode, new_bh,
preserve);
- if (ret)
+ if (ret) {
mlog_errno(ret);
+ goto inode_unlock;
+ }
}
+
+ ret = ocfs2_complete_reflink(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret)
+ mlog_errno(ret);
+
inode_unlock:
ocfs2_inode_unlock(new_inode, 1);
brelse(new_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c0e48aeebb1..14f47d2bfe0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -773,18 +773,20 @@ static int ocfs2_sb_probe(struct super_block *sb,
if (tmpstat < 0) {
status = tmpstat;
mlog_errno(status);
- goto bail;
+ break;
}
di = (struct ocfs2_dinode *) (*bh)->b_data;
memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
spin_lock_init(&stats->b_lock);
- status = ocfs2_verify_volume(di, *bh, blksize, stats);
- if (status >= 0)
- goto bail;
- brelse(*bh);
- *bh = NULL;
- if (status != -EAGAIN)
+ tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+ if (tmpstat < 0) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ if (tmpstat != -EAGAIN) {
+ status = tmpstat;
break;
+ }
}
bail:
@@ -1645,6 +1647,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = numbits;
buf->f_ffree = freebits;
+ buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+ & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+ OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
brelse(bh);
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index b6284f235d2..c61369342a2 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,11 +53,6 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-#endif
#define MLOG_MASK_PREFIX ML_UPTODATE
diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c..b4b31d277f3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
error = -EPERM;
if (!capable(CAP_SYS_CHROOT))
goto dput_and_out;
+ error = security_path_chroot(&path);
+ if (error)
+ goto dput_and_out;
set_fs_root(current->fs, &path);
error = 0;
@@ -617,11 +620,15 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
if (err)
goto out_putf;
mutex_lock(&inode->i_mutex);
+ err = security_path_chmod(dentry, file->f_vfsmnt, mode);
+ if (err)
+ goto out_unlock;
if (mode == (mode_t) -1)
mode = inode->i_mode;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
err = notify_change(dentry, &newattrs);
+out_unlock:
mutex_unlock(&inode->i_mutex);
mnt_drop_write(file->f_path.mnt);
out_putf:
@@ -646,11 +653,15 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
if (error)
goto dput_and_out;
mutex_lock(&inode->i_mutex);
+ error = security_path_chmod(path.dentry, path.mnt, mode);
+ if (error)
+ goto out_unlock;
if (mode == (mode_t) -1)
mode = inode->i_mode;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
error = notify_change(path.dentry, &newattrs);
+out_unlock:
mutex_unlock(&inode->i_mutex);
mnt_drop_write(path.mnt);
dput_and_out:
@@ -664,9 +675,9 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
return sys_fchmodat(AT_FDCWD, filename, mode);
}
-static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
+static int chown_common(struct path *path, uid_t user, gid_t group)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = path->dentry->d_inode;
int error;
struct iattr newattrs;
@@ -683,7 +694,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
mutex_lock(&inode->i_mutex);
- error = notify_change(dentry, &newattrs);
+ error = security_path_chown(path, user, group);
+ if (!error)
+ error = notify_change(path->dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
return error;
@@ -700,7 +713,7 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(path.dentry, user, group);
+ error = chown_common(&path, user, group);
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
@@ -725,7 +738,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(path.dentry, user, group);
+ error = chown_common(&path, user, group);
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
@@ -744,7 +757,7 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(path.dentry, user, group);
+ error = chown_common(&path, user, group);
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
@@ -767,7 +780,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
goto out_fput;
dentry = file->f_path.dentry;
audit_inode(NULL, dentry);
- error = chown_common(dentry, user, group);
+ error = chown_common(&file->f_path, user, group);
mnt_drop_write(file->f_path.mnt);
out_fput:
fput(file);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f38fee0311a..7b685e10cba 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
part_stat_read(p, merges[WRITE]),
(unsigned long long)part_stat_read(p, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
- p->in_flight,
+ part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
}
+ssize_t part_inflight_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_stat.attr,
+ &dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
diff --git a/fs/pipe.c b/fs/pipe.c
index 52c41511483..ae17d026aaa 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
static int
pipe_read_open(struct inode *inode, struct file *filp)
{
- /* We could have perhaps used atomic_t, but this and friends
- below are the only places. So it doesn't seem worthwhile. */
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- inode->i_pipe->readers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ inode->i_pipe->readers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
static int
pipe_write_open(struct inode *inode, struct file *filp)
{
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- inode->i_pipe->writers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ inode->i_pipe->writers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
+ int ret = -ENOENT;
+
mutex_lock(&inode->i_mutex);
- if (filp->f_mode & FMODE_READ)
- inode->i_pipe->readers++;
- if (filp->f_mode & FMODE_WRITE)
- inode->i_pipe->writers++;
+
+ if (inode->i_pipe) {
+ ret = 0;
+ if (filp->f_mode & FMODE_READ)
+ inode->i_pipe->readers++;
+ if (filp->f_mode & FMODE_WRITE)
+ inode->i_pipe->writers++;
+ }
+
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
/*
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 07f77a7945c..822c2d50651 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -571,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
rsslim,
mm ? mm->start_code : 0,
mm ? mm->end_code : 0,
- (permitted) ? task->stack_start : 0,
+ (permitted && mm) ? task->stack_start : 0,
esp,
eip,
/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 837469a9659..af643b5aefe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
name.len = snprintf(buf, sizeof(buf), "%d", pid);
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
- if (!(current->flags & PF_EXITING))
- shrink_dcache_parent(dentry);
+ shrink_dcache_parent(dentry);
d_drop(dentry);
dput(dentry);
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c7bff4f603f..a65239cfd97 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"VmallocUsed: %8lu kB\n"
"VmallocChunk: %8lu kB\n"
#ifdef CONFIG_MEMORY_FAILURE
- "HardwareCorrupted: %8lu kB\n"
+ "HardwareCorrupted: %5lu kB\n"
#endif
,
K(i.totalram),
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46..353e78a9ebe 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
config QUOTA_NETLINK_INTERFACE
bool "Report quota messages through netlink interface"
- depends on QUOTA && NET
+ depends on QUOTACTL && NET
help
If you say Y here, quota warnings (about exceeding softlimit, reaching
hardlimit, etc.) will be reported through netlink interface. If unsure,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7e..9b6ad908dcb 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
#include <asm/uaccess.h>
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
}
#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "VFS_DQUOT",
- .version = 1,
- .maxattr = QUOTA_NL_A_MAX,
-};
-
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
- static atomic_t seq;
- struct sk_buff *skb;
- void *msg_head;
- int ret;
- int msg_size = 4 * nla_total_size(sizeof(u32)) +
- 2 * nla_total_size(sizeof(u64));
-
- /* We have to allocate using GFP_NOFS as we are called from a
- * filesystem performing write and thus further recursion into
- * the fs to free some data could cause deadlocks. */
- skb = genlmsg_new(msg_size, GFP_NOFS);
- if (!skb) {
- printk(KERN_ERR
- "VFS: Not enough memory to send quota warning.\n");
- return;
- }
- msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
- &quota_genl_family, 0, QUOTA_NL_C_WARNING);
- if (!msg_head) {
- printk(KERN_ERR
- "VFS: Cannot store netlink header in quota warning.\n");
- goto err_out;
- }
- ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
- MAJOR(dquot->dq_sb->s_dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
- MINOR(dquot->dq_sb->s_dev));
- if (ret)
- goto attr_err_out;
- ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
- if (ret)
- goto attr_err_out;
- genlmsg_end(skb, msg_head);
-
- genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
- return;
-attr_err_out:
- printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
- kfree_skb(skb);
-}
-#endif
/*
* Write warnings to the console and send warning messages over netlink.
*
@@ -1145,18 +1074,20 @@ err_out:
*/
static void flush_warnings(struct dquot *const *dquots, char *warntype)
{
+ struct dquot *dq;
int i;
- for (i = 0; i < MAXQUOTAS; i++)
- if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
- !warning_issued(dquots[i], warntype[i])) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dq = dquots[i];
+ if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+ !warning_issued(dq, warntype[i])) {
#ifdef CONFIG_PRINT_QUOTA_WARNING
- print_warning(dquots[i], warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
- send_warning(dquots[i], warntype[i]);
+ print_warning(dq, warntype[i]);
#endif
+ quota_send_warning(dq->dq_type, dq->dq_id,
+ dq->dq_sb->s_dev, warntype[i]);
}
+ }
}
static int ignore_hardlimit(struct dquot *dquot)
@@ -2607,12 +2538,6 @@ static int __init dquot_init(void)
register_shrinker(&dqcache_shrinker);
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
- if (genl_register_family(&quota_genl_family) != 0)
- printk(KERN_ERR
- "VFS: Failed to create quota netlink interface.\n");
-#endif
-
return 0;
}
module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b..ee91e275695 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
/* Check validity of generic quotactl commands */
static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
return ret;
}
#endif
+
+
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "VFS_DQUOT",
+ .version = 1,
+ .maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+ const char warntype)
+{
+ static atomic_t seq;
+ struct sk_buff *skb;
+ void *msg_head;
+ int ret;
+ int msg_size = 4 * nla_total_size(sizeof(u32)) +
+ 2 * nla_total_size(sizeof(u64));
+
+ /* We have to allocate using GFP_NOFS as we are called from a
+ * filesystem performing write and thus further recursion into
+ * the fs to free some data could cause deadlocks. */
+ skb = genlmsg_new(msg_size, GFP_NOFS);
+ if (!skb) {
+ printk(KERN_ERR
+ "VFS: Not enough memory to send quota warning.\n");
+ return;
+ }
+ msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+ &quota_genl_family, 0, QUOTA_NL_C_WARNING);
+ if (!msg_head) {
+ printk(KERN_ERR
+ "VFS: Cannot store netlink header in quota warning.\n");
+ goto err_out;
+ }
+ ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+ if (ret)
+ goto attr_err_out;
+ ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+ if (ret)
+ goto attr_err_out;
+ genlmsg_end(skb, msg_head);
+
+ genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+ return;
+attr_err_out:
+ printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+ if (genl_register_family(&quota_genl_family) != 0)
+ printk(KERN_ERR
+ "VFS: Failed to create quota netlink interface.\n");
+ return 0;
+};
+
+module_init(quota_init);
+#endif
+
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208adf8e7..71e2b4d50a0 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd)
- return romfs_mtd_strnlen(sb, pos, limit);
+ return romfs_mtd_strnlen(sb, pos, maxlen);
#endif
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev)
- return romfs_blk_strnlen(sb, pos, limit);
+ return romfs_blk_strnlen(sb, pos, maxlen);
#endif
return -EIO;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0050fc40e8c..e0201837d24 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -21,6 +21,7 @@
#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "sysfs.h"
DEFINE_MUTEX(sysfs_mutex);
@@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
sysfs_put(sd->s_symlink.target_sd);
if (sysfs_type(sd) & SYSFS_COPY_NAME)
kfree(sd->s_name);
+ if (sd->s_iattr && sd->s_iattr->ia_secdata)
+ security_release_secctx(sd->s_iattr->ia_secdata,
+ sd->s_iattr->ia_secdata_len);
kfree(sd->s_iattr);
sysfs_free_ino(sd->s_ino);
kmem_cache_free(sysfs_dir_cachep, sd);
@@ -894,7 +898,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
mutex_lock(&sysfs_rename_mutex);
BUG_ON(!sd->s_parent);
- new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
+ new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+ new_parent_kobj->sd : &sysfs_root;
error = 0;
if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c050ce..f5ea4680f15 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
struct sysfs_open_dirent *od, *new_od = NULL;
retry:
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irq(&sysfs_open_dirent_lock);
if (!sd->s_attr.open && new_od) {
sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
list_add_tail(&buffer->list, &od->buffers);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irq(&sysfs_open_dirent_lock);
if (od) {
kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
struct sysfs_buffer *buffer)
{
struct sysfs_open_dirent *od = sd->s_attr.open;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
list_del(&buffer->list);
if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
else
od = NULL;
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
kfree(od);
}
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
void sysfs_notify_dirent(struct sysfs_dirent *sd)
{
struct sysfs_open_dirent *od;
+ unsigned long flags;
- spin_lock(&sysfs_open_dirent_lock);
+ spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
od = sd->s_attr.open;
if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
wake_up_interruptible(&od->poll);
}
- spin_unlock(&sysfs_open_dirent_lock);
+ spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
}
EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index c6ad7c7e3ee..05ac0fe9c4d 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -36,7 +36,7 @@ posix_acl_from_xattr(const void *value, size_t size)
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
acl_e = acl->a_entries;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 381854461b2..c2e30eea74d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
}
/*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+ xfs_ioend_t *ioend)
+{
+ xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ xfs_fsize_t isize;
+ xfs_fsize_t bsize;
+
+ bsize = ioend->io_offset + ioend->io_size;
+ isize = MAX(ip->i_size, ip->i_new_size);
+ isize = MIN(isize, bsize);
+ return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
* Update on-disk file size now that data has been written to disk.
* The current in-memory file size is i_size. If a write is beyond
* eof i_new_size will be the intended file size until i_size is
* updated. If this write does not extend all the way to the valid
* file size then restrict this update to the end of the write.
*/
+
STATIC void
xfs_setfilesize(
xfs_ioend_t *ioend)
{
xfs_inode_t *ip = XFS_I(ioend->io_inode);
xfs_fsize_t isize;
- xfs_fsize_t bsize;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,16 +224,10 @@ xfs_setfilesize(
if (unlikely(ioend->io_error))
return;
- bsize = ioend->io_offset + ioend->io_size;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
-
- if (ip->i_d.di_size < isize) {
+ isize = xfs_ioend_new_eof(ioend);
+ if (isize) {
ip->i_d.di_size = isize;
- ip->i_update_core = 1;
xfs_mark_inode_dirty_sync(ip);
}
@@ -404,10 +416,16 @@ xfs_submit_ioend_bio(
struct bio *bio)
{
atomic_inc(&ioend->io_remaining);
-
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
+ /*
+ * If the I/O is beyond EOF we mark the inode dirty immediately
+ * but don't update the inode size until I/O completion.
+ */
+ if (xfs_ioend_new_eof(ioend))
+ xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+
submit_bio(WRITE, bio);
ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
bio_put(bio);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 629370974e5..eff61e2732a 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -176,14 +176,7 @@ xfs_file_fsync(
struct dentry *dentry,
int datasync)
{
- struct inode *inode = dentry->d_inode;
- struct xfs_inode *ip = XFS_I(inode);
- int error;
-
- /* capture size updates in I/O completion before writing the inode. */
- error = filemap_fdatawait(inode->i_mapping);
- if (error)
- return error;
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
xfs_iflags_clear(ip, XFS_ITRUNCATED);
return -xfs_fsync(ip);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da0159d99f8..cd42ef78f6b 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -57,19 +57,22 @@
#include <linux/fiemap.h>
/*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
*/
void
-xfs_synchronize_atime(
+xfs_synchronize_times(
xfs_inode_t *ip)
{
struct inode *inode = VFS_I(ip);
- if (!(inode->i_state & I_CLEAR)) {
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- }
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
}
/*
@@ -106,32 +109,20 @@ xfs_ichgtime(
if ((flags & XFS_ICHGTIME_MOD) &&
!timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
if ((flags & XFS_ICHGTIME_CHG) &&
!timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
sync_it = 1;
}
/*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
+ * Update complete - now make sure everyone knows that the inode
+ * is dirty.
*/
- if (sync_it) {
- SYNCHRONIZE();
- ip->i_update_core = 1;
+ if (sync_it)
xfs_mark_inode_dirty_sync(ip);
- }
}
/*
@@ -506,10 +497,8 @@ xfs_vn_getattr(
stat->gid = ip->i_d.di_gid;
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 49e4a6aea73..072050f8d34 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
xip->i_new_size = new_size;
if (likely(!(ioflags & IO_INVIS)))
- xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ file_update_time(file);
/*
* If the offset is beyond the size of the file, we have a couple
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9e41f91aa26..3d4a0c84d63 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -80,7 +80,7 @@ xfs_fs_set_xstate(
if (sb->s_flags & MS_RDONLY)
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bdd41c8c342..18a4b8e11df 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -977,6 +977,28 @@ xfs_fs_inode_init_once(
}
/*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode. Care must be taken
+ * here - the transaction code calls mark_inode_dirty_sync() to mark the
+ * VFS inode dirty in a transaction and clears the i_update_core field;
+ * it must clear the field after calling mark_inode_dirty_sync() to
+ * correctly indicate that the dirty state has been propagated into the
+ * inode log item.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+ struct inode *inode)
+{
+ barrier();
+ XFS_I(inode)->i_update_core = 1;
+}
+
+/*
* Attempt to flush the inode, this will actually fail
* if the inode is pinned, but we dirty the inode again
* at the point when it is unpinned after a log write,
@@ -1126,7 +1148,7 @@ xfs_fs_put_super(
}
STATIC int
-xfs_fs_sync_super(
+xfs_fs_sync_fs(
struct super_block *sb,
int wait)
{
@@ -1134,23 +1156,23 @@ xfs_fs_sync_super(
int error;
/*
- * Treat a sync operation like a freeze. This is to work
- * around a race in sync_inodes() which works in two phases
- * - an asynchronous flush, which can write out an inode
- * without waiting for file size updates to complete, and a
- * synchronous flush, which wont do anything because the
- * async flush removed the inode's dirty flag. Also
- * sync_inodes() will not see any files that just have
- * outstanding transactions to be flushed because we don't
- * dirty the Linux inode until after the transaction I/O
- * completes.
+ * Not much we can do for the first async pass. Writing out the
+ * superblock would be counter-productive as we are going to redirty
+ * when writing out other data and metadata (and writing out a single
+ * block is quite fast anyway).
+ *
+ * Try to asynchronously kick off quota syncing at least.
*/
- if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
- error = xfs_quiesce_data(mp);
- else
- error = xfs_sync_fsdata(mp, 0);
+ if (!wait) {
+ xfs_qm_sync(mp, SYNC_TRYLOCK);
+ return 0;
+ }
+
+ error = xfs_quiesce_data(mp);
+ if (error)
+ return -error;
- if (unlikely(laptop_mode)) {
+ if (laptop_mode) {
int prev_sync_seq = mp->m_sync_seq;
/*
@@ -1169,7 +1191,7 @@ xfs_fs_sync_super(
mp->m_sync_seq != prev_sync_seq);
}
- return -error;
+ return 0;
}
STATIC int
@@ -1539,10 +1561,11 @@ xfs_fs_get_sb(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
.write_inode = xfs_fs_write_inode,
.clear_inode = xfs_fs_clear_inode,
.put_super = xfs_fs_put_super,
- .sync_fs = xfs_fs_sync_super,
+ .sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
.statfs = xfs_fs_statfs,
.remount_fs = xfs_fs_remount,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 320be6aea49..961df0a22c7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
STATIC int
xfs_commit_dummy_trans(
struct xfs_mount *mp,
- uint log_flags)
+ uint flags)
{
struct xfs_inode *ip = mp->m_rootip;
struct xfs_trans *tp;
int error;
+ int log_flags = XFS_LOG_FORCE;
+
+ if (flags & SYNC_WAIT)
+ log_flags |= XFS_LOG_SYNC;
/*
* Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /* XXX(hch): ignoring the error here.. */
error = xfs_trans_commit(tp, 0);
-
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* the log force ensures this transaction is pushed to disk */
xfs_log_force(mp, 0, log_flags);
- return 0;
+ return error;
}
int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
else
XFS_BUF_ASYNC(bp);
- return xfs_bwrite(mp, bp);
+ error = xfs_bwrite(mp, bp);
+ if (error)
+ return error;
+
+ /*
+ * If this is a data integrity sync make sure all pending buffers
+ * are flushed out for the log coverage check below.
+ */
+ if (flags & SYNC_WAIT)
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+ if (xfs_log_need_covered(mp))
+ error = xfs_commit_dummy_trans(mp, flags);
+ return error;
out_brelse:
xfs_buf_relse(bp);
@@ -419,14 +435,16 @@ xfs_quiesce_data(
/* push non-blocking */
xfs_sync_data(mp, 0);
xfs_qm_sync(mp, SYNC_TRYLOCK);
- xfs_filestream_flush(mp);
- /* push and block */
+ /* push and block till complete */
xfs_sync_data(mp, SYNC_WAIT);
xfs_qm_sync(mp, SYNC_WAIT);
+ /* drop inode references pinned by filestreams */
+ xfs_filestream_flush(mp);
+
/* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp, 0);
+ error = xfs_sync_fsdata(mp, SYNC_WAIT);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -570,8 +588,6 @@ xfs_sync_worker(
/* dgc: errors ignored here */
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
- if (xfs_log_need_covered(mp))
- error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 4e4276b956e..5d1a3b98a6e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -876,7 +876,6 @@ xfs_dqrele_inode(
ip->i_gdquot = NULL;
}
xfs_iput(ip, XFS_ILOCK_EXCL);
- IRELE(ip);
return 0;
}
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9ee125..ab89a7e94a0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
- (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
- (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
- (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = XFS_ERROR(EBUSY);
goto out_unlock;
}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e45944..41ad537c49e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
*/
ra_want = howmany(bufsize + mp->m_dirblksize,
mp->m_sb.sb_blocksize) - 1;
+ ASSERT(ra_want >= 0);
/*
* If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
*/
ptr += length;
curoff += length;
- bufsize -= length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
}
/*
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab64f3efb43..0785797db82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -880,6 +880,7 @@ nextag:
* Not in range - save last search
* location and allocate a new inode
*/
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c1dc7ef5a1d..b92a4fa2a0a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
SYNCHRONIZE();
/*
- * Make sure to get the latest atime from the Linux inode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- xfs_synchronize_atime(ip);
+ xfs_synchronize_times(ip);
if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0b38b9a869e..41555de1d1d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
-void xfs_synchronize_atime(xfs_inode_t *);
+void xfs_synchronize_times(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 47d5b663c37..9794b876d6f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
nvecs = 1;
/*
+ * Make sure the linux inode is dirty. We do this before
+ * clearing i_update_core as the VFS will call back into
+ * XFS here and set i_update_core, so we need to dirty the
+ * inode first so that the ordering of i_update_core and
+ * unlogged modifications still works as described below.
+ */
+ xfs_mark_inode_dirty_sync(ip);
+
+ /*
* Clear i_update_core if the timestamps (or any other
* non-transactional modification) need flushing/logging
* and we're about to log them with the rest of the core.
@@ -263,14 +272,9 @@ xfs_inode_item_format(
}
/*
- * Make sure to get the latest atime from the Linux inode.
+ * Make sure to get the latest timestamps from the Linux inode.
*/
- xfs_synchronize_atime(ip);
-
- /*
- * make sure the linux inode is dirty
- */
- xfs_mark_inode_dirty_sync(ip);
+ xfs_synchronize_times(ip);
vecp->i_addr = (xfs_caddr_t)&ip->i_d;
vecp->i_len = sizeof(struct xfs_icdinode);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b68f9107e26..62efab2f383 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
{
xfs_icdinode_t *dic; /* dinode core info pointer */
xfs_inode_t *ip; /* incore inode pointer */
+ struct inode *inode;
int error;
error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
+ inode = VFS_I(ip);
/* xfs_iget returns the following without needing
* further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
+
/*
- * We are reading the atime from the Linux inode because the
- * dinode might not be uptodate.
+ * We need to read the timestamps from the Linux inode because
+ * the VFS keeps writing directly into the inode structure instead
+ * of telling us about the updates.
*/
- buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
- buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
- buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
- buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1099395d7d6..fb17f8226b0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer(
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+ if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
cmn_err(CE_ALERT,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[i].i_len, __func__);
@@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans(
"XFS: NULL dquot in %s.", __func__);
return XFS_ERROR(EIO);
}
- if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+ if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
cmn_err(CE_ALERT,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f31271c30de..2ffc570679b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -467,6 +467,7 @@ xfs_trans_ail_update(
{
xfs_log_item_t *dlip = NULL;
xfs_log_item_t *mlip; /* ptr to minimum lip */
+ xfs_lsn_t tail_lsn;
mlip = xfs_ail_min(ailp);
@@ -483,8 +484,16 @@ xfs_trans_ail_update(
if (mlip == dlip) {
mlip = xfs_ail_min(ailp);
+ /*
+ * It is not safe to access mlip after the AIL lock is
+ * dropped, so we must get a copy of li_lsn before we do
+ * so. This is especially important on 32-bit platforms
+ * where accessing and updating 64-bit values like li_lsn
+ * is not atomic.
+ */
+ tail_lsn = mlip->li_lsn;
spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
} else {
spin_unlock(&ailp->xa_lock);
}
@@ -514,6 +523,7 @@ xfs_trans_ail_delete(
{
xfs_log_item_t *dlip;
xfs_log_item_t *mlip;
+ xfs_lsn_t tail_lsn;
if (lip->li_flags & XFS_LI_IN_AIL) {
mlip = xfs_ail_min(ailp);
@@ -527,9 +537,16 @@ xfs_trans_ail_delete(
if (mlip == dlip) {
mlip = xfs_ail_min(ailp);
+ /*
+ * It is not safe to access mlip after the AIL lock
+ * is dropped, so we must get a copy of li_lsn
+ * before we do so. This is especially important
+ * on 32-bit platforms where accessing and updating
+ * 64-bit values like li_lsn is not atomic.
+ */
+ tail_lsn = mlip ? mlip->li_lsn : 0;
spin_unlock(&ailp->xa_lock);
- xfs_log_move_tail(ailp->xa_mount,
- (mlip ? mlip->li_lsn : 0));
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
} else {
spin_unlock(&ailp->xa_lock);
}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a434f287962..b572f7e840e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2476,12 +2476,6 @@ xfs_reclaim(
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
/*
- * Make sure the atime in the XFS inode is correct before freeing the
- * Linux inode.
- */
- xfs_synchronize_atime(ip);
-
- /*
* If we have nothing to flush with this inode then complete the
* teardown now, otherwise break the link between the xfs inode and the
* linux inode and clean up the xfs inode later. This avoids flushing