aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-01-29 09:24:57 +0100
committerIngo Molnar <mingo@elte.hu>2010-01-29 10:36:22 +0100
commitae7f6711d6231c9ba54feb5ba9856c3775e482f8 (patch)
tree89070c82204b2503348e4fd6c51d25a169375545 /fs
parent64abebf731df87e6f4ae7d9ffc340bdf0c033e44 (diff)
parentb23ff0e9330e4b11e18af984d50573598e10e7f9 (diff)
Merge branch 'perf/urgent' into perf/core
Merge reason: We want to queue up a dependent patch. Also update to later -rc's. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c41
-rw-r--r--fs/bio.c2
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/extent-tree.c32
-rw-r--r--fs/btrfs/file.c100
-rw-r--r--fs/btrfs/inode.c12
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c3
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/symlink.c4
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/ecryptfs/file.c17
-rw-r--r--fs/ecryptfs/inode.c158
-rw-r--r--fs/ecryptfs/main.c4
-rw-r--r--fs/eventfd.c89
-rw-r--r--fs/ext4/ext4.h9
-rw-r--r--fs/ext4/extents.c21
-rw-r--r--fs/ext4/inode.c82
-rw-r--r--fs/fcntl.c108
-rw-r--r--fs/hppfs/hppfs.c18
-rw-r--r--fs/namei.c23
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/ramfs/file-nommu.c26
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c14
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c183
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c2
-rw-r--r--fs/xfs/xfs_dfrag.c106
-rw-r--r--fs/xfs/xfs_iget.c1
-rw-r--r--fs/xfs/xfs_inode.c21
-rw-r--r--fs/xfs/xfs_rtalloc.c2
36 files changed, 715 insertions, 415 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 18f74ec4dce..9d03d1ebca6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1001,44 +1001,6 @@ done:
}
/**
- * v9fs_vfs_readlink - read a symlink's location
- * @dentry: dentry for symlink
- * @buffer: buffer to load symlink location into
- * @buflen: length of buffer
- *
- */
-
-static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
- int buflen)
-{
- int retval;
- int ret;
- char *link = __getname();
-
- if (unlikely(!link))
- return -ENOMEM;
-
- if (buflen > PATH_MAX)
- buflen = PATH_MAX;
-
- P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
- dentry);
-
- retval = v9fs_readlink(dentry, link, buflen);
-
- if (retval > 0) {
- if ((ret = copy_to_user(buffer, link, retval)) != 0) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "problem copying to user: %d\n", ret);
- retval = ret;
- }
- }
-
- __putname(link);
- return retval;
-}
-
-/**
* v9fs_vfs_follow_link - follow a symlink path
* @dentry: dentry for symlink
* @nd: nameidata
@@ -1230,7 +1192,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
.rmdir = v9fs_vfs_rmdir,
.mknod = v9fs_vfs_mknod,
.rename = v9fs_vfs_rename,
- .readlink = v9fs_vfs_readlink,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
@@ -1253,7 +1214,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
};
static const struct inode_operations v9fs_symlink_inode_operations = {
- .readlink = v9fs_vfs_readlink,
+ .readlink = generic_readlink,
.follow_link = v9fs_vfs_follow_link,
.put_link = v9fs_vfs_put_link,
.getattr = v9fs_vfs_getattr,
diff --git a/fs/bio.c b/fs/bio.c
index 76e6713abf9..12429c9553e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
i = 0;
while (i < bio_slab_nr) {
- struct bio_slab *bslab = &bio_slabs[i];
+ bslab = &bio_slabs[i];
if (!bslab->slab && entry == -1)
entry = i;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2e9e69987a8..54f4798ab46 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -112,12 +112,14 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
mode = inode->i_mode;
- ret = posix_acl_equiv_mode(acl, &mode);
- if (ret < 0)
- return ret;
- ret = 0;
- inode->i_mode = mode;
name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ inode->i_mode = mode;
+ }
+ ret = 0;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56e50137d0e..432a2da4641 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+ atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (atomic_dec_and_test(&cache->count))
+ kfree(cache);
+}
+
/*
* this adds the block group to the fs_info rb tree for the block group
* cache
@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
}
}
if (ret)
- atomic_inc(&ret->count);
+ btrfs_get_block_group(ret);
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -407,6 +418,8 @@ err:
put_caching_control(caching_ctl);
atomic_dec(&block_group->space_info->caching_threads);
+ btrfs_put_block_group(block_group);
+
return 0;
}
@@ -447,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
up_write(&fs_info->extent_commit_sem);
atomic_inc(&cache->space_info->caching_threads);
+ btrfs_get_block_group(cache);
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
cache->key.objectid);
@@ -486,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
return cache;
}
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
- if (atomic_dec_and_test(&cache->count))
- kfree(cache);
-}
-
static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
u64 flags)
{
@@ -2582,7 +2590,7 @@ next_block_group(struct btrfs_root *root,
if (node) {
cache = rb_entry(node, struct btrfs_block_group_cache,
cache_node);
- atomic_inc(&cache->count);
+ btrfs_get_block_group(cache);
} else
cache = NULL;
spin_unlock(&root->fs_info->block_group_cache_lock);
@@ -4227,7 +4235,7 @@ search:
u64 offset;
int cached;
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
search_start = block_group->key.objectid;
have_block_group:
@@ -4315,7 +4323,7 @@ have_block_group:
btrfs_put_block_group(block_group);
block_group = last_ptr->block_group;
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&last_ptr->lock);
spin_unlock(&last_ptr->refill_lock);
@@ -7395,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
-
- WARN_ON(atomic_read(&block_group->count) != 1);
- kfree(block_group);
+ btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index feaa13b105d..c02033596f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -506,7 +506,8 @@ next_slot:
}
static int extent_mergeable(struct extent_buffer *leaf, int slot,
- u64 objectid, u64 bytenr, u64 *start, u64 *end)
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
@@ -522,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
@@ -561,6 +563,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
u64 split;
int del_nr = 0;
int del_slot = 0;
+ int recow;
int ret;
btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -568,6 +571,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
again:
+ recow = 0;
split = start;
key.objectid = inode->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -591,12 +595,60 @@ again:
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
+
+ if (start == key.offset && end < extent_end) {
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ if (start > key.offset && end == extent_end) {
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
while (start > key.offset || end < extent_end) {
if (key.offset == start)
split = end;
- memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = split;
ret = btrfs_duplicate_item(trans, root, path, &new_key);
if (ret == -EAGAIN) {
@@ -631,15 +683,18 @@ again:
path->slots[0]--;
extent_end = end;
}
+ recow = 1;
}
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
other_start = end;
other_end = 0;
- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
- bytenr, &other_start, &other_end)) {
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
@@ -650,8 +705,13 @@ again:
}
other_start = 0;
other_end = start;
- if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
- bytenr, &other_start, &other_end)) {
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ inode->i_ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
@@ -660,22 +720,22 @@ again:
inode->i_ino, orig_offset);
BUG_ON(ret);
}
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
if (del_nr == 0) {
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_mark_buffer_dirty(leaf);
- goto out;
- }
-
- fi = btrfs_item_ptr(leaf, del_slot - 1,
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ } else {
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
- ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- BUG_ON(ret);
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
+ }
out:
btrfs_free_path(path);
return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5440bab2363..b330e27c2d8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3796,6 +3796,12 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root);
+ if (unlikely(root->clean_orphans) &&
+ !(inode->i_sb->s_flags & MS_RDONLY)) {
+ down_read(&root->fs_info->cleanup_work_sem);
+ btrfs_orphan_cleanup(root);
+ up_read(&root->fs_info->cleanup_work_sem);
+ }
return inode;
}
@@ -3995,7 +4001,11 @@ skip:
/* Reached end of directory/root. Bump pos past the last item. */
if (key_type == BTRFS_DIR_INDEX_KEY)
- filp->f_pos = INT_LIMIT(off_t);
+ /*
+ * 32-bit glibc will use getdents64, but then strtol -
+ * so the last number we can serve is this.
+ */
+ filp->f_pos = 0x7fffffff;
else
filp->f_pos++;
nopos:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b10a49d4bc6..5c2a9e78a94 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -626,6 +626,8 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
if (ordered)
offset = entry_end(ordered);
+ else
+ offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
mutex_lock(&tree->mutex);
disk_i_size = BTRFS_I(inode)->disk_i_size;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a9728680eca..ed3e4a2ec2c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3281,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
return -ENOMEM;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ kfree(cluster);
return -ENOMEM;
+ }
rc->extents_found = 0;
rc->extents_skipped = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 198cff28766..220dad5db01 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2649,8 +2649,10 @@ again:
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
- if (!em && unplug_page)
+ if (!em && unplug_page) {
+ kfree(multi);
return 0;
+ }
if (!em) {
printk(KERN_CRIT "unable to find logical %llu len %llu\n",
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index fea9e898c4b..b44ce0a0711 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -269,7 +269,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
int err;
mntget(newmnt);
- err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist);
+ err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
switch (err) {
case 0:
path_put(&nd->path);
@@ -371,7 +371,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
if (IS_ERR(mnt))
goto out_err;
- nd->path.mnt->mnt_flags |= MNT_SHRINKABLE;
rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
out:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 332dd00f089..c5c45de1a2e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1005,6 +1005,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
#endif
+/* Big V (don't complain on serial console) */
+IGNORE_IOCTL(VT_OPENQRY)
+IGNORE_IOCTL(VT_GETMODE)
/* Little p (/dev/rtc, /dev/envctrl, etc.) */
COMPATIBLE_IOCTL(RTC_AIE_ON)
COMPATIBLE_IOCTL(RTC_AIE_OFF)
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index c8afa6b1d91..32a5f46b115 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -121,8 +121,10 @@ static int get_target(const char *symname, struct path *path,
ret = -ENOENT;
path_put(path);
}
- } else
+ } else {
ret = -EPERM;
+ path_put(path);
+ }
}
return ret;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed69..7cb0a59f4b9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
char *cipher_name, size_t *key_size)
{
char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
- char *full_alg_name;
+ char *full_alg_name = NULL;
int rc;
*key_tfm = NULL;
@@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
if (rc)
goto out;
*key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
- kfree(full_alg_name);
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
goto out;
}
out:
+ kfree(full_alg_name);
return rc;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001..678172b61be 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -158,7 +158,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
struct dentry *ecryptfs_dentry = file->f_path.dentry;
/* Private value of ecryptfs_dentry allocated in
* ecryptfs_lookup() */
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+ struct dentry *lower_dentry;
struct ecryptfs_file_info *file_info;
mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
- if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
- && !(file->f_flags & O_RDONLY)) {
- rc = -EPERM;
- printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
- "file must hence be opened RO\n", __func__);
- goto out;
- }
if (!ecryptfs_inode_to_private(inode)->lower_file) {
rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
if (rc) {
@@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
goto out;
}
}
+ if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
+ && !(file->f_flags & O_RDONLY)) {
+ rc = -EPERM;
+ printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+ "file must hence be opened RO\n", __func__);
+ goto out;
+ }
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -299,7 +299,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file,
const struct file_operations ecryptfs_dir_fops = {
.readdir = ecryptfs_readdir,
.ioctl = ecryptfs_ioctl,
- .mmap = generic_file_mmap,
.open = ecryptfs_open,
.flush = ecryptfs_flush,
.release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 429ca0b3ba0..4a430ab4115 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -282,7 +282,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
goto out;
}
rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
- ecryptfs_dir_inode->i_sb, 1);
+ ecryptfs_dir_inode->i_sb,
+ ECRYPTFS_INTERPOSE_FLAG_D_ADD);
if (rc) {
printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
__func__, rc);
@@ -463,9 +464,6 @@ out_lock:
unlock_dir(lower_dir_dentry);
dput(lower_new_dentry);
dput(lower_old_dentry);
- d_drop(lower_old_dentry);
- d_drop(new_dentry);
- d_drop(old_dentry);
return rc;
}
@@ -614,6 +612,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
+ struct dentry *trap = NULL;
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -621,7 +620,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dget(lower_new_dentry);
lower_old_dir_dentry = dget_parent(lower_old_dentry);
lower_new_dir_dentry = dget_parent(lower_new_dentry);
- lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ /* source should not be ancestor of target */
+ if (trap == lower_old_dentry) {
+ rc = -EINVAL;
+ goto out_lock;
+ }
+ /* target should not be ancestor of source */
+ if (trap == lower_new_dentry) {
+ rc = -ENOTEMPTY;
+ goto out_lock;
+ }
rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
lower_new_dir_dentry->d_inode, lower_new_dentry);
if (rc)
@@ -715,31 +724,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
/* Released in ecryptfs_put_link(); only release here on error */
buf = kmalloc(len, GFP_KERNEL);
if (!buf) {
- rc = -ENOMEM;
+ buf = ERR_PTR(-ENOMEM);
goto out;
}
old_fs = get_fs();
set_fs(get_ds());
rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
set_fs(old_fs);
- if (rc < 0)
- goto out_free;
- else
+ if (rc < 0) {
+ kfree(buf);
+ buf = ERR_PTR(rc);
+ } else
buf[rc] = '\0';
- rc = 0;
- nd_set_link(nd, buf);
- goto out;
-out_free:
- kfree(buf);
out:
- return ERR_PTR(rc);
+ nd_set_link(nd, buf);
+ return NULL;
}
static void
ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
{
- /* Free the char* */
- kfree(nd_get_link(nd));
+ char *buf = nd_get_link(nd);
+ if (!IS_ERR(buf)) {
+ /* Free the char* */
+ kfree(buf);
+ }
}
/**
@@ -772,18 +781,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
}
/**
- * ecryptfs_truncate
+ * truncate_upper
* @dentry: The ecryptfs layer dentry
- * @new_length: The length to expand the file to
+ * @ia: Address of the ecryptfs inode's attributes
+ * @lower_ia: Address of the lower inode's attributes
*
* Function to handle truncations modifying the size of the file. Note
* that the file sizes are interpolated. When expanding, we are simply
- * writing strings of 0's out. When truncating, we need to modify the
- * underlying file size according to the page index interpolations.
+ * writing strings of 0's out. When truncating, we truncate the upper
+ * inode and update the lower_ia according to the page index
+ * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
+ * the caller must use lower_ia in a call to notify_change() to perform
+ * the truncation of the lower inode.
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+static int truncate_upper(struct dentry *dentry, struct iattr *ia,
+ struct iattr *lower_ia)
{
int rc = 0;
struct inode *inode = dentry->d_inode;
@@ -794,8 +808,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
loff_t lower_size_before_truncate;
loff_t lower_size_after_truncate;
- if (unlikely((new_length == i_size)))
+ if (unlikely((ia->ia_size == i_size))) {
+ lower_ia->ia_valid &= ~ATTR_SIZE;
goto out;
+ }
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
/* Set up a fake ecryptfs file, this is used to interface with
* the file in the underlying filesystem so that the
@@ -815,28 +831,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
&fake_ecryptfs_file,
ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
/* Switch on growing or shrinking file */
- if (new_length > i_size) {
+ if (ia->ia_size > i_size) {
char zero[] = { 0x00 };
+ lower_ia->ia_valid &= ~ATTR_SIZE;
/* Write a single 0 at the last position of the file;
* this triggers code that will fill in 0's throughout
* the intermediate portion of the previous end of the
* file and the new and of the file */
rc = ecryptfs_write(&fake_ecryptfs_file, zero,
- (new_length - 1), 1);
- } else { /* new_length < i_size_read(inode) */
- /* We're chopping off all the pages down do the page
- * in which new_length is located. Fill in the end of
- * that page from (new_length & ~PAGE_CACHE_MASK) to
+ (ia->ia_size - 1), 1);
+ } else { /* ia->ia_size < i_size_read(inode) */
+ /* We're chopping off all the pages down to the page
+ * in which ia->ia_size is located. Fill in the end of
+ * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
* PAGE_CACHE_SIZE with zeros. */
size_t num_zeros = (PAGE_CACHE_SIZE
- - (new_length & ~PAGE_CACHE_MASK));
+ - (ia->ia_size & ~PAGE_CACHE_MASK));
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = vmtruncate(inode, new_length);
+ rc = vmtruncate(inode, ia->ia_size);
if (rc)
goto out_free;
- rc = vmtruncate(lower_dentry->d_inode, new_length);
+ lower_ia->ia_size = ia->ia_size;
+ lower_ia->ia_valid |= ATTR_SIZE;
goto out_free;
}
if (num_zeros) {
@@ -848,7 +866,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
goto out_free;
}
rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
- new_length, num_zeros);
+ ia->ia_size, num_zeros);
kfree(zeros_virt);
if (rc) {
printk(KERN_ERR "Error attempting to zero out "
@@ -857,7 +875,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
goto out_free;
}
}
- vmtruncate(inode, new_length);
+ vmtruncate(inode, ia->ia_size);
rc = ecryptfs_write_inode_size_to_metadata(inode);
if (rc) {
printk(KERN_ERR "Problem with "
@@ -870,10 +888,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
lower_size_before_truncate =
upper_size_to_lower_size(crypt_stat, i_size);
lower_size_after_truncate =
- upper_size_to_lower_size(crypt_stat, new_length);
- if (lower_size_after_truncate < lower_size_before_truncate)
- vmtruncate(lower_dentry->d_inode,
- lower_size_after_truncate);
+ upper_size_to_lower_size(crypt_stat, ia->ia_size);
+ if (lower_size_after_truncate < lower_size_before_truncate) {
+ lower_ia->ia_size = lower_size_after_truncate;
+ lower_ia->ia_valid |= ATTR_SIZE;
+ } else
+ lower_ia->ia_valid &= ~ATTR_SIZE;
}
out_free:
if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -883,6 +903,33 @@ out:
return rc;
}
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The length to expand the file to
+ *
+ * Simple function that handles the truncation of an eCryptfs inode and
+ * its corresponding lower inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+ struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
+ struct iattr lower_ia = { .ia_valid = 0 };
+ int rc;
+
+ rc = truncate_upper(dentry, &ia, &lower_ia);
+ if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = notify_change(lower_dentry, &lower_ia);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ }
+ return rc;
+}
+
static int
ecryptfs_permission(struct inode *inode, int mask)
{
@@ -905,6 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
{
int rc = 0;
struct dentry *lower_dentry;
+ struct iattr lower_ia;
struct inode *inode;
struct inode *lower_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -943,15 +991,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
+ memcpy(&lower_ia, ia, sizeof(lower_ia));
+ if (ia->ia_valid & ATTR_FILE)
+ lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
if (ia->ia_valid & ATTR_SIZE) {
- ecryptfs_printk(KERN_DEBUG,
- "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
- ia->ia_valid, ATTR_SIZE);
- rc = ecryptfs_truncate(dentry, ia->ia_size);
- /* ecryptfs_truncate handles resizing of the lower file */
- ia->ia_valid &= ~ATTR_SIZE;
- ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
- ia->ia_valid);
+ rc = truncate_upper(dentry, ia, &lower_ia);
if (rc < 0)
goto out;
}
@@ -960,17 +1004,32 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
* mode change is for clearing setuid/setgid bits. Allow lower fs
* to interpret this in its own way.
*/
- if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
- ia->ia_valid &= ~ATTR_MODE;
+ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ lower_ia.ia_valid &= ~ATTR_MODE;
mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = notify_change(lower_dentry, ia);
+ rc = notify_change(lower_dentry, &lower_ia);
mutex_unlock(&lower_dentry->d_inode->i_mutex);
out:
fsstack_copy_attr_all(inode, lower_inode);
return rc;
}
+int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct kstat lower_stat;
+ int rc;
+
+ rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
+ ecryptfs_dentry_to_lower(dentry), &lower_stat);
+ if (!rc) {
+ generic_fillattr(dentry->d_inode, stat);
+ stat->blocks = lower_stat.blocks;
+ }
+ return rc;
+}
+
int
ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
@@ -1100,6 +1159,7 @@ const struct inode_operations ecryptfs_dir_iops = {
const struct inode_operations ecryptfs_main_iops = {
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
+ .getattr = ecryptfs_getattr,
.setxattr = ecryptfs_setxattr,
.getxattr = ecryptfs_getxattr,
.listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 567bc4b9f70..ea2f92101df 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -585,8 +585,8 @@ out:
* with as much information as it can before needing
* the lower filesystem.
* ecryptfs_read_super(): this accesses the lower filesystem and uses
- * ecryptfs_interpolate to perform most of the linking
- * ecryptfs_interpolate(): links the lower filesystem into ecryptfs
+ * ecryptfs_interpose to perform most of the linking
+ * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
*/
static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
const char *dev_name, void *raw_data,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d26402ff06e..7758cc382ef 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
return events;
}
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
+static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+{
+ *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ ctx->count -= *cnt;
+}
+
+/**
+ * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
+ * @ctx: [in] Pointer to eventfd context.
+ * @wait: [in] Wait queue to be removed.
+ * @cnt: [out] Pointer to the 64bit conter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN : The operation would have blocked.
+ *
+ * This is used to atomically remove a wait queue entry from the eventfd wait
+ * queue head, and read/reset the counter value.
+ */
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+ __u64 *cnt)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->wqh.lock, flags);
+ eventfd_ctx_do_read(ctx, cnt);
+ __remove_wait_queue(&ctx->wqh, wait);
+ if (*cnt != 0 && waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, POLLOUT);
+ spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+ return *cnt != 0 ? 0 : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Different from zero if the operation should not block.
+ * @cnt: [out] Pointer to the 64bit conter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{
- struct eventfd_ctx *ctx = file->private_data;
ssize_t res;
- __u64 ucnt = 0;
DECLARE_WAITQUEUE(wait, current);
- if (count < sizeof(ucnt))
- return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
+ *cnt = 0;
res = -EAGAIN;
if (ctx->count > 0)
- res = sizeof(ucnt);
- else if (!(file->f_flags & O_NONBLOCK)) {
+ res = 0;
+ else if (!no_wait) {
__add_wait_queue(&ctx->wqh, &wait);
- for (res = 0;;) {
+ for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (ctx->count > 0) {
- res = sizeof(ucnt);
+ res = 0;
break;
}
if (signal_pending(current)) {
@@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
__remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING);
}
- if (likely(res > 0)) {
- ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
- ctx->count -= ucnt;
+ if (likely(res == 0)) {
+ eventfd_ctx_do_read(ctx, cnt);
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, POLLOUT);
}
spin_unlock_irq(&ctx->wqh.lock);
- if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
- return -EFAULT;
return res;
}
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+ ssize_t res;
+ __u64 cnt;
+
+ if (count < sizeof(cnt))
+ return -EINVAL;
+ res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+ if (res < 0)
+ return res;
+
+ return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+}
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
loff_t *ppos)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af7b62699ea..874d169a193 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -361,14 +361,11 @@ struct ext4_new_group_data {
so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
- /* Call ext4_da_update_reserve_space() after successfully
- allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
/* caller is from the direct IO path, request to creation of an
unitialized extents if not allocated, split the uninitialized
extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO 0x0010
-#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO 0x0008
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
/* Convert extent to initialized after direct IO complete */
@@ -1443,6 +1440,8 @@ extern int ext4_block_truncate_page(handle_t *handle,
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d7b74e9468..765a4826b11 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3132,7 +3132,19 @@ out:
unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
newblock + max_blocks,
allocated - max_blocks);
+ allocated = max_blocks;
}
+
+ /*
+ * If we have done fallocate with the offset that is already
+ * delayed allocated, we would have block reservation
+ * and quota reservation done in the delayed write path.
+ * But fallocate would have already updated quota and block
+ * count for this offset. So cancel these reservation
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 0);
+
map_out:
set_buffer_mapped(bh_result);
out1:
@@ -3368,9 +3380,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
+ if (allocated > max_blocks)
+ allocated = max_blocks;
set_buffer_new(bh_result);
/*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 1);
+
+ /*
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an uninitialized extent.
*/
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c818972c830..e11952404e0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1053,11 +1053,12 @@ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
* Called with i_data_sem down, which is important since we can call
* ext4_discard_preallocations() from here.
*/
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- int mdb_free = 0;
+ int mdb_free = 0, allocated_meta_blocks = 0;
spin_lock(&ei->i_block_reservation_lock);
if (unlikely(used > ei->i_reserved_data_blocks)) {
@@ -1073,6 +1074,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ei->i_reserved_data_blocks -= used;
used += ei->i_allocated_meta_blocks;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ allocated_meta_blocks = ei->i_allocated_meta_blocks;
ei->i_allocated_meta_blocks = 0;
percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
@@ -1090,9 +1092,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
/* Update quota subsystem */
- vfs_dq_claim_block(inode, used);
- if (mdb_free)
- vfs_dq_release_reservation_block(inode, mdb_free);
+ if (quota_claim) {
+ vfs_dq_claim_block(inode, used);
+ if (mdb_free)
+ vfs_dq_release_reservation_block(inode, mdb_free);
+ } else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not update the quota for allocated blocks. But then
+ * converting an fallocate region to initialized region would
+ * have caused a metadata allocation. So claim quota for
+ * that
+ */
+ if (allocated_meta_blocks)
+ vfs_dq_claim_block(inode, allocated_meta_blocks);
+ vfs_dq_release_reservation_block(inode, mdb_free + used);
+ }
/*
* If we have done all the pending block allocations and if
@@ -1292,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
*/
EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
- }
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
+ }
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now.
- */
- if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
- ext4_da_update_reserve_space(inode, retval);
-
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
int ret = check_block_validity(inode, "file system "
@@ -1835,24 +1853,12 @@ repeat:
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, md_needed + 1)) {
- /*
- * We tend to badly over-estimate the amount of
- * metadata blocks which are needed, so if we have
- * reserved any metadata blocks, try to force out the
- * inode and see if we have any better luck.
- */
- if (md_reserved && retries++ <= 3)
- goto retry;
+ if (vfs_dq_reserve_block(inode, md_needed + 1))
return -EDQUOT;
- }
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
vfs_dq_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- retry:
- if (md_reserved)
- write_inode_now(inode, (retries == 3));
yield();
goto repeat;
}
@@ -2213,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* variables are updated after the blocks have been allocated.
*/
new.b_state = 0;
- get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
&new, get_blocks_flags);
if (blks < 0) {
@@ -3032,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret, retries = 0;
+ int ret, retries = 0, quota_retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
@@ -3091,6 +3097,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+
+ if ((ret == -EDQUOT) &&
+ EXT4_I(inode)->i_reserved_meta_blocks &&
+ (quota_retries++ < 3)) {
+ /*
+ * Since we often over-estimate the number of meta
+ * data blocks required, we may sometimes get a
+ * spurios out of quota error even though there would
+ * be enough space once we write the data blocks and
+ * find out how many meta data blocks were _really_
+ * required. So try forcing the inode write to see if
+ * that helps.
+ */
+ write_inode_now(inode, (quota_retries == 3));
+ goto retry;
+ }
out:
return ret;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a6..5ef953e6f90 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -199,7 +199,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- write_lock_irq(&filp->f_owner.lock);
+ unsigned long flags;
+
+ write_lock_irqsave(&filp->f_owner.lock, flags);
if (force || !filp->f_owner.pid) {
put_pid(filp->f_owner.pid);
filp->f_owner.pid = get_pid(pid);
@@ -211,7 +213,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
filp->f_owner.euid = cred->euid;
}
}
- write_unlock_irq(&filp->f_owner.lock);
+ write_unlock_irqrestore(&filp->f_owner.lock, flags);
}
int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
@@ -618,60 +620,90 @@ static DEFINE_RWLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
/*
- * fasync_helper() is used by almost all character device drivers
- * to set up the fasync queue. It returns negative on error, 0 if it did
- * no changes and positive if it added/deleted the entry.
+ * Remove a fasync entry. If successfully removed, return
+ * positive and clear the FASYNC flag. If no entry exists,
+ * do nothing and return 0.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ *
+ * We always take the 'filp->f_lock', in since fasync_lock
+ * needs to be irq-safe.
*/
-int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
struct fasync_struct *fa, **fp;
- struct fasync_struct *new = NULL;
int result = 0;
- if (on) {
- new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
- if (!new)
- return -ENOMEM;
+ spin_lock(&filp->f_lock);
+ write_lock_irq(&fasync_lock);
+ for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
+ if (fa->fa_file != filp)
+ continue;
+ *fp = fa->fa_next;
+ kmem_cache_free(fasync_cache, fa);
+ filp->f_flags &= ~FASYNC;
+ result = 1;
+ break;
}
+ write_unlock_irq(&fasync_lock);
+ spin_unlock(&filp->f_lock);
+ return result;
+}
+
+/*
+ * Add a fasync entry. Return negative on error, positive if
+ * added, and zero if did nothing but change an existing one.
+ *
+ * NOTE! It is very important that the FASYNC flag always
+ * match the state "is the filp on a fasync list".
+ */
+static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+{
+ struct fasync_struct *new, *fa, **fp;
+ int result = 0;
+
+ new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
- /*
- * We need to take f_lock first since it's not an IRQ-safe
- * lock.
- */
spin_lock(&filp->f_lock);
write_lock_irq(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
- if (fa->fa_file == filp) {
- if(on) {
- fa->fa_fd = fd;
- kmem_cache_free(fasync_cache, new);
- } else {
- *fp = fa->fa_next;
- kmem_cache_free(fasync_cache, fa);
- result = 1;
- }
- goto out;
- }
+ if (fa->fa_file != filp)
+ continue;
+ fa->fa_fd = fd;
+ kmem_cache_free(fasync_cache, new);
+ goto out;
}
- if (on) {
- new->magic = FASYNC_MAGIC;
- new->fa_file = filp;
- new->fa_fd = fd;
- new->fa_next = *fapp;
- *fapp = new;
- result = 1;
- }
+ new->magic = FASYNC_MAGIC;
+ new->fa_file = filp;
+ new->fa_fd = fd;
+ new->fa_next = *fapp;
+ *fapp = new;
+ result = 1;
+ filp->f_flags |= FASYNC;
+
out:
- if (on)
- filp->f_flags |= FASYNC;
- else
- filp->f_flags &= ~FASYNC;
write_unlock_irq(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
+/*
+ * fasync_helper() is used by almost all character device drivers
+ * to set up the fasync queue, and for regular files by the file
+ * lease code. It returns negative on error, 0 if it did no changes
+ * and positive if it added/deleted the entry.
+ */
+int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+{
+ if (!on)
+ return fasync_remove_entry(filp, fapp);
+ return fasync_add_entry(fd, filp, fapp);
+}
+
EXPORT_SYMBOL(fasync_helper);
void __kill_fasync(struct fasync_struct *fa, int sig, int band)
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index a5089a6dd67..7239efc690d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = {
static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
int buflen)
{
- struct dentry *proc_dentry;
-
- proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+ struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
buflen);
}
static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- struct dentry *proc_dentry;
-
- proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+ struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
}
+static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
+ void *cookie)
+{
+ struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+
+ if (proc_dentry->d_inode->i_op->put_link)
+ proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
+}
+
static const struct inode_operations hppfs_dir_iops = {
.lookup = hppfs_lookup,
};
@@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = {
static const struct inode_operations hppfs_link_iops = {
.readlink = hppfs_readlink,
.follow_link = hppfs_follow_link,
+ .put_link = hppfs_put_link,
};
static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
diff --git a/fs/namei.c b/fs/namei.c
index b55440baf7a..94a5e60779f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -561,6 +561,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
dget(dentry);
}
mntget(path->mnt);
+ nd->last_type = LAST_BIND;
cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
error = PTR_ERR(cookie);
if (!IS_ERR(cookie)) {
@@ -1603,11 +1604,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
struct file *filp;
struct nameidata nd;
int error;
- struct path path, save;
+ struct path path;
struct dentry *dir;
int count = 0;
int will_truncate;
int flag = open_to_namei_flags(open_flag);
+ int force_reval = 0;
/*
* O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
@@ -1619,7 +1621,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
open_flag |= O_DSYNC;
if (!acc_mode)
- acc_mode = MAY_OPEN | ACC_MODE(flag);
+ acc_mode = MAY_OPEN | ACC_MODE(open_flag);
/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC)
@@ -1659,9 +1661,12 @@ struct file *do_filp_open(int dfd, const char *pathname,
/*
* Create - we need to know the parent.
*/
+reval:
error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
if (error)
return ERR_PTR(error);
+ if (force_reval)
+ nd.flags |= LOOKUP_REVAL;
error = path_walk(pathname, &nd);
if (error) {
if (nd.root.mnt)
@@ -1853,17 +1858,7 @@ do_link:
error = security_inode_follow_link(path.dentry, &nd);
if (error)
goto exit_dput;
- save = nd.path;
- path_get(&save);
error = __do_follow_link(&path, &nd);
- if (error == -ESTALE) {
- /* nd.path had been dropped */
- nd.path = save;
- path_get(&nd.path);
- nd.flags |= LOOKUP_REVAL;
- error = __do_follow_link(&path, &nd);
- }
- path_put(&save);
path_put(&path);
if (error) {
/* Does someone understand code flow here? Or it is only
@@ -1873,6 +1868,10 @@ do_link:
release_open_intent(&nd);
if (nd.root.mnt)
path_put(&nd.root);
+ if (error == -ESTALE && !force_reval) {
+ force_reval = 1;
+ goto reval;
+ }
return ERR_PTR(error);
}
nd.flags &= ~LOOKUP_PARENT;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7d70d63ceb2..c768f733c8d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -965,10 +965,12 @@ EXPORT_SYMBOL(may_umount_tree);
int may_umount(struct vfsmount *mnt)
{
int ret = 1;
+ down_read(&namespace_sem);
spin_lock(&vfsmount_lock);
if (propagate_mount_busy(mnt, 2))
ret = 0;
spin_unlock(&vfsmount_lock);
+ up_read(&namespace_sem);
return ret;
}
@@ -1352,12 +1354,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
if (err)
goto out_cleanup_ids;
+ spin_lock(&vfsmount_lock);
+
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
-
- spin_lock(&vfsmount_lock);
if (parent_path) {
detach_mnt(source_mnt, parent_path);
attach_mnt(source_mnt, path);
@@ -1534,8 +1536,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = change_mount_flags(path->mnt, flags);
else
err = do_remount_sb(sb, flags, data, 0);
- if (!err)
+ if (!err) {
+ spin_lock(&vfsmount_lock);
+ mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK;
path->mnt->mnt_flags = mnt_flags;
+ spin_unlock(&vfsmount_lock);
+ }
up_write(&sb->s_umount);
if (!err) {
security_sb_post_remount(path->mnt, flags, data);
@@ -1665,6 +1671,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
{
int err;
+ mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD);
+
down_write(&namespace_sem);
/* Something was mounted here while we slept */
while (d_mountpoint(path->dentry) &&
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e..1afb0a10229 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data)
if (warned)
return 0;
- warned = false;
+ warned = true;
entry = p;
ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 8271cf05c95..a94e8bd8eb1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -552,7 +552,7 @@ retry:
spin_lock(&group->inotify_data.idr_lock);
ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
- group->inotify_data.last_wd,
+ group->inotify_data.last_wd+1,
&tmp_ientry->wd);
spin_unlock(&group->inotify_data.idr_lock);
if (ret) {
@@ -632,7 +632,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
- group->inotify_data.last_wd = 1;
+ group->inotify_data.last_wd = 0;
group->inotify_data.user = user;
group->inotify_data.fa = NULL;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18d5cc62d8e..e42bbd843ed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1419,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
goto out;
error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
- nd->last_type = LAST_BIND;
out:
return ERR_PTR(error);
}
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2efc57173fd..1739a4aba25 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -123,30 +123,6 @@ add_error:
/*****************************************************************************/
/*
- * check that file shrinkage doesn't leave any VMAs dangling in midair
- */
-static int ramfs_nommu_check_mappings(struct inode *inode,
- size_t newsize, size_t size)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
-
- /* search for VMAs that fall within the dead zone */
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- newsize >> PAGE_SHIFT,
- (size + PAGE_SIZE - 1) >> PAGE_SHIFT
- ) {
- /* found one - only interested if it's shared out of the page
- * cache */
- if (vma->vm_flags & VM_SHARED)
- return -ETXTBSY; /* not quite true, but near enough */
- }
-
- return 0;
-}
-
-/*****************************************************************************/
-/*
*
*/
static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
@@ -164,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
/* check that a decrease in size doesn't cut off any shared mappings */
if (newsize < size) {
- ret = ramfs_nommu_check_mappings(inode, newsize, size);
+ ret = nommu_shrink_inode_mappings(inode, size, newsize);
if (ret < 0)
return ret;
}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 09783cc444a..77414db10dc 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
- * If we have nothing to flush with this inode then complete the
- * teardown now, otherwise delay the flush operation.
+ * We always use background reclaim here because even if the
+ * inode is clean, it still may be under IO and hence we have
+ * to take the flush lock. The background reclaim path handles
+ * this more efficiently than we can here, so simply let background
+ * reclaim tear down all inodes.
*/
- if (!xfs_inode_clean(ip)) {
- xfs_inode_set_reclaim_tag(ip);
- return;
- }
-
out_reclaim:
- xfs_ireclaim(ip);
+ xfs_inode_set_reclaim_tag(ip);
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a8cd3..1f5e4bb5e97 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
- read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
(void **)&ip, *first_index, 1, tag);
}
if (!nr_found)
- goto unlock;
+ return NULL;
/*
* Update the index for the next lookup. Catch overflows
@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
*/
*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- goto unlock;
-
+ return NULL;
return ip;
-
-unlock:
- read_unlock(&pag->pag_ici_lock);
- return NULL;
}
STATIC int
@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive)
{
struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index;
@@ -114,10 +109,20 @@ restart:
int error = 0;
xfs_inode_t *ip;
+ if (exclusive)
+ write_lock(&pag->pag_ici_lock);
+ else
+ read_lock(&pag->pag_ici_lock);
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
- if (!ip)
+ if (!ip) {
+ if (exclusive)
+ write_unlock(&pag->pag_ici_lock);
+ else
+ read_unlock(&pag->pag_ici_lock);
break;
+ }
+ /* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags);
if (error == EAGAIN) {
skipped++;
@@ -125,9 +130,8 @@ restart:
}
if (error)
last_error = error;
- /*
- * bail out if the filesystem is corrupted.
- */
+
+ /* bail out if the filesystem is corrupted. */
if (error == EFSCORRUPTED)
break;
@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int exclusive)
{
int error = 0;
int last_error = 0;
@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init)
continue;
- error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+ error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+ exclusive);
if (error) {
last_error = error;
if (error == EFSCORRUPTED)
@@ -174,30 +180,31 @@ xfs_sync_inode_valid(
struct xfs_perag *pag)
{
struct inode *inode = VFS_I(ip);
+ int error = EFSCORRUPTED;
/* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- read_unlock(&pag->pag_ici_lock);
- return EFSCORRUPTED;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_unlock;
- /*
- * If we can't get a reference on the inode, it must be in reclaim.
- * Leave it for the reclaim code to flush. Also avoid inodes that
- * haven't been fully initialised.
- */
- if (!igrab(inode)) {
- read_unlock(&pag->pag_ici_lock);
- return ENOENT;
- }
- read_unlock(&pag->pag_ici_lock);
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ error = ENOENT;
+ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock;
- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ goto out_unlock;
+
+ if (is_bad_inode(inode)) {
IRELE(ip);
- return ENOENT;
+ goto out_unlock;
}
- return 0;
+ /* inode is valid */
+ error = 0;
+out_unlock:
+ read_unlock(&pag->pag_ici_lock);
+ return error;
}
STATIC int
@@ -282,7 +289,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
if (error)
return XFS_ERROR(error);
@@ -304,7 +311,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
}
STATIC int
@@ -664,60 +671,6 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task);
}
-STATIC int
-xfs_reclaim_inode(
- xfs_inode_t *ip,
- int sync_mode)
-{
- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
- /* The hash lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
- write_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- return -EAGAIN;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_put_perag(ip->i_mount, pag);
-
- /*
- * If the inode is still dirty, then flush it out. If the inode
- * is not in the AIL, then it will be OK to flush it delwri as
- * long as xfs_iflush() does not keep any references to the inode.
- * We leave that decision up to xfs_iflush() since it has the
- * knowledge of whether it's OK to simply do a delwri flush of
- * the inode or whether we need to wait until the inode is
- * pulled from the AIL.
- * We get the flush lock regardless, though, just to make sure
- * we don't free it while it is being flushed.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
-
- /*
- * In the case of a forced shutdown we rely on xfs_iflush() to
- * wait for the inode to be unpinned before returning an error.
- */
- if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
- /* synchronize with xfs_iflush_done */
- xfs_iflock(ip);
- xfs_ifunlock(ip);
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_ireclaim(ip);
- return 0;
-}
-
void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
@@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag(
}
STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
- int flags)
+ int sync_mode)
{
- /* ignore if already under reclaim */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
return 0;
}
- read_unlock(&pag->pag_ici_lock);
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
- return xfs_reclaim_inode(ip, flags);
+ /*
+ * If the inode is still dirty, then flush it out. If the inode
+ * is not in the AIL, then it will be OK to flush it delwri as
+ * long as xfs_iflush() does not keep any references to the inode.
+ * We leave that decision up to xfs_iflush() since it has the
+ * knowledge of whether it's OK to simply do a delwri flush of
+ * the inode or whether we need to wait until the inode is
+ * pulled from the AIL.
+ * We get the flush lock regardless, though, just to make sure
+ * we don't free it while it is being flushed.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_iflock(ip);
+
+ /*
+ * In the case of a forced shutdown we rely on xfs_iflush() to
+ * wait for the inode to be unpinned before returning an error.
+ */
+ if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+ /* synchronize with xfs_iflush_done */
+ xfs_iflock(ip);
+ xfs_ifunlock(ip);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_ireclaim(ip);
+ return 0;
}
int
@@ -780,6 +769,6 @@ xfs_reclaim_inodes(
xfs_mount_t *mp,
int mode)
{
- return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
- XFS_ICI_RECLAIM_TAG);
+ return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+ XFS_ICI_RECLAIM_TAG, 1);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d9183..ea932b43335 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag);
+ int flags, int tag, int write_lock);
#endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76fe8a2..873e07e2907 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}
/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d1483a4f71b..84ca1cf16a1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -114,10 +114,82 @@ xfs_swapext(
return error;
}
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check root block of temp in btree form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+
+ /* Check root block of target in btree form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ return 0;
+}
+
int
xfs_swap_extents(
- xfs_inode_t *ip,
- xfs_inode_t *tip,
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
xfs_swapext_t *sxp)
{
xfs_mount_t *mp;
@@ -161,13 +233,6 @@ xfs_swap_extents(
goto out_unlock;
}
- /* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
if (VN_CACHED(VFS_I(tip)) != 0) {
error = xfs_flushinval_pages(tip, 0, -1,
FI_REMAPF_LOCKED);
@@ -189,13 +254,12 @@ xfs_swap_extents(
goto out_unlock;
}
- /*
- * If the target has extended attributes, the tmp file
- * must also in order to ensure the correct data fork
- * format.
- */
- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
- error = XFS_ERROR(EINVAL);
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_fs_cmn_err(CE_NOTE, mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __FILE__, ip->i_ino);
goto out_unlock;
}
@@ -276,6 +340,16 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
+ * Fix the in-memory data fork values that are dependent on the fork
+ * offset in the inode. We can't assume they remain the same as attr2
+ * has dynamic fork offsets.
+ */
+ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index fa402a6bbbc..155e798f30a 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -73,7 +73,6 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush));
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 391d36b0e68..ef77fd88c8e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2842,13 +2842,9 @@ xfs_iflush(
/*
* If the inode isn't dirty, then just release the inode flush lock and
- * do nothing. Treat stale inodes the same; we cannot rely on the
- * backing buffer remaining stale in cache for the remaining life of
- * the stale inode and so xfs_itobp() below may give us a buffer that
- * no longer contains inodes below. Doing this stale check here also
- * avoids forcing the log on pinned, stale inodes.
+ * do nothing.
*/
- if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
+ if (xfs_inode_clean(ip)) {
xfs_ifunlock(ip);
return 0;
}
@@ -2872,6 +2868,19 @@ xfs_iflush(
xfs_iunpin_wait(ip);
/*
+ * For stale inodes we cannot rely on the backing buffer remaining
+ * stale in cache for the remaining life of the stale inode and so
+ * xfs_itobp() below may give us a buffer that no longer contains
+ * inodes below. We have to check this after ensuring the inode is
+ * unpinned so that it is safe to reclaim the stale inode after the
+ * flush call.
+ */
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
+
+ /*
* This may have been unpinned because the filesystem is shutting
* down forcibly. If that's the case we must not write this inode
* to disk, because the log record didn't make it to disk!
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9e15a118536..6be05f756d5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1517,6 +1517,8 @@ xfs_rtfree_range(
*/
error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
&postblock);
+ if (error)
+ return error;
/*
* If there are blocks not being freed at the front of the
* old extent, add summary data for them to be allocated.