diff options
Diffstat (limited to 'fs')
55 files changed, 1856 insertions, 722 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6fcb1e7095c..92828281a30 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) buffer = kmap(page); offset = page_offset(page); - retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE); + retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); if (retval < 0) goto done; diff --git a/fs/Kconfig b/fs/Kconfig index a97263be6a9..0e7da7bb5d9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -186,32 +186,7 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" - -config NILFS2_FS - tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - select CRC32 - help - NILFS2 is a log-structured file system (LFS) supporting continuous - snapshotting. In addition to versioning capability of the entire - file system, users can even restore files mistakenly overwritten or - destroyed just a few seconds ago. Since this file system can keep - consistency like conventional LFS, it achieves quick recovery after - system crashes. - - NILFS2 creates a number of checkpoints every few seconds or per - synchronous write basis (unless there is no change). Users can - select significant versions among continuously created checkpoints, - and can change them into snapshots which will be preserved for long - periods until they are changed back to checkpoints. Each - snapshot is mountable as a read-only file system concurrently with - its writable mount, and this feature is convenient for online backup. - - Some features including atime, extended attributes, and POSIX ACLs, - are not supported yet. - - To compile this file system support as a module, choose M here: the - module will be called nilfs2. If unsure, say N. +source "fs/nilfs2/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/block_dev.c b/fs/block_dev.c index 3a6d4fb2a32..94dfda24c06 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev) EXPORT_SYMBOL(bdget); +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_inode->i_count); + return bdev; +} + long nr_blockdev_pages(void) { struct block_device *bdev; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 6e4f6c50a12..019e8af449a 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work) * list */ if (worker->idle) { - spin_lock_irqsave(&worker->workers->lock, flags); + spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, &worker->workers->worker_list); - spin_unlock_irqrestore(&worker->workers->lock, flags); + spin_unlock(&worker->workers->lock); } if (!worker->working) { wake = 1; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60a45f3a4e9..3fdcc0512d3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) btrfs_disk_key_to_cpu(&k1, disk); - if (k1.objectid > k2->objectid) - return 1; - if (k1.objectid < k2->objectid) - return -1; - if (k1.type > k2->type) - return 1; - if (k1.type < k2->type) - return -1; - if (k1.offset > k2->offset) - return 1; - if (k1.offset < k2->offset) - return -1; - return 0; + return btrfs_comp_cpu_keys(&k1, k2); } /* @@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root struct extent_buffer *b; int slot; int ret; + int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -1737,8 +1723,6 @@ again: p->locks[level] = 1; if (cow) { - int wret; - /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -1749,12 +1733,12 @@ again: btrfs_set_path_blocking(p); - wret = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (wret) { + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { free_extent_buffer(b); - ret = wret; + ret = err; goto done; } } @@ -1793,41 +1777,45 @@ cow_done: ret = bin_search(b, key, level, &slot); if (level != 0) { - if (ret && slot > 0) + int dec = 0; + if (ret && slot > 0) { + dec = 1; slot -= 1; + } p->slots[level] = slot; - ret = setup_nodes_for_search(trans, root, p, b, level, + err = setup_nodes_for_search(trans, root, p, b, level, ins_len); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - else if (ret) + if (err) { + ret = err; goto done; + } b = p->nodes[level]; slot = p->slots[level]; unlock_up(p, level, lowest_unlock); - /* this is only true while dropping a snapshot */ if (level == lowest_level) { - ret = 0; + if (dec) + p->slots[level]++; goto done; } - ret = read_block_for_search(trans, root, p, + err = read_block_for_search(trans, root, p, &b, level, slot, key); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - - if (ret == -EIO) + if (err) { + ret = err; goto done; + } if (!p->skip_locking) { - int lret; - btrfs_clear_path_blocking(p, NULL); - lret = btrfs_try_spin_lock(b); + err = btrfs_try_spin_lock(b); - if (!lret) { + if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); btrfs_clear_path_blocking(p, b); @@ -1837,16 +1825,14 @@ cow_done: p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret; - btrfs_set_path_blocking(p); - sret = split_leaf(trans, root, key, - p, ins_len, ret == 0); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); btrfs_clear_path_blocking(p, NULL); - BUG_ON(sret > 0); - if (sret) { - ret = sret; + BUG_ON(err > 0); + if (err) { + ret = err; goto done; } } @@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -4042,10 +4028,9 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level, + struct btrfs_key *key, int level, int cache_only, u64 min_trans) { - int level = lowest_level; int slot; struct extent_buffer *c; @@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, c = path->nodes[level]; next: if (slot >= btrfs_header_nritems(c)) { - level++; - if (level == BTRFS_MAX_LEVEL) + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) return 1; - continue; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(root, path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; } + if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { @@ -4146,7 +4160,8 @@ again: * advance the path if there are now more items available. */ if (nritems > 0 && path->slots[0] < nritems - 1) { - path->slots[0]++; + if (ret == 0) + path->slots[0]++; ret = 0; goto done; } @@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root, path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.type == type) - return 0; if (found_key.objectid < min_objectid) break; + if (found_key.type == type) + return 0; if (found_key.objectid == min_objectid && found_key.type < type) break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a87383871..837435ce84c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -481,7 +481,7 @@ struct btrfs_shared_data_ref { struct btrfs_extent_inline_ref { u8 type; - u64 offset; + __le64 offset; } __attribute__ ((__packed__)); /* old style backrefs item */ @@ -689,6 +689,7 @@ struct btrfs_space_info { struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; + atomic_t caching_threads; }; /* @@ -707,6 +708,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -716,24 +720,37 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; spinlock_t lock; - struct mutex cache_mutex; u64 pinned; u64 reserved; u64 flags; - int cached; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; int ro; int dirty; + /* cache tracking stuff */ + wait_queue_head_t caching_q; + int cached; + struct btrfs_space_info *space_info; /* free space cache stuff */ spinlock_t tree_lock; - struct rb_root free_space_bytes; struct rb_root free_space_offset; + u64 free_space; /* block group cache stuff */ struct rb_node cache_node; @@ -808,6 +825,7 @@ struct btrfs_fs_info { struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + struct rw_semaphore extent_commit_sem; /* * this protects the ordered operations list only while we are @@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c95f7..e83be2e4602 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1639,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); @@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + fs_info->closing = 2; + smp_mb(); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); + btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d4..dc84daee6bc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> +#include <linux/kthread.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } /* + * We always set EXTENT_LOCKED for the super mirror extents so we don't + * overwrite them, so those bits need to be unset. Also, if we are unmounting + * with pinned extents still sitting there because we had a block group caching, + * we need to clear those now, since we are done. + */ +void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +{ + u64 start, end, last = 0; + int ret; + + while (1) { + ret = find_first_extent_bit(&info->pinned_extents, last, + &start, &end, + EXTENT_LOCKED|EXTENT_DIRTY); + if (ret) + break; + + clear_extent_bits(&info->pinned_extents, start, end, + EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); + last = end+1; + } +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + try_lock_extent(&fs_info->pinned_extents, + logical[nr], + logical[nr] + stripe_len - 1, GFP_NOFS); + } + kfree(logical); + } + + return 0; +} + +/* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static int add_new_free_space(struct btrfs_block_group_cache *block_group, +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end) { - u64 extent_start, extent_end, size; + u64 extent_start, extent_end, size, total_added = 0; int ret; while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY|EXTENT_LOCKED); if (ret) break; @@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); @@ -178,84 +237,80 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, if (start < end) { size = end - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - return 0; + return total_added; } -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(&root->fs_info->mapping_tree, - cache->key.objectid, bytenr, 0, - &logical, &nr, &stripe_len); - BUG_ON(ret); - while (nr--) { - btrfs_remove_free_space(cache, logical[nr], - stripe_len); - } - kfree(logical); - } - return 0; -} - -static int cache_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group) +static int caching_kthread(void *data) { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 last = 0; struct btrfs_path *path; int ret = 0; struct btrfs_key key; struct extent_buffer *leaf; int slot; - u64 last; - - if (!block_group) - return 0; + u64 total_found = 0; - root = root->fs_info->extent_root; - - if (block_group->cached) - return 0; + BUG_ON(!fs_info); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + atomic_inc(&block_group->space_info->caching_threads); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + /* - * we get into deadlocks with paths held by callers of this function. - * since the alloc_mutex is protecting things right now, just - * skip the locking here + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only */ path->skip_locking = 1; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + path->search_commit_root = 1; + path->reada = 2; + key.objectid = last; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto err; while (1) { + smp_mb(); + if (block_group->fs_info->closing > 1) { + last = (u64)-1; + break; + } + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->extent_root, path); if (ret < 0) goto err; - if (ret == 0) - continue; - else + else if (ret) break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + btrfs_release_path(fs_info->extent_root, path); + up_read(&fs_info->extent_commit_sem); + schedule_timeout(1); + goto again; + } + + continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid < block_group->key.objectid) @@ -266,24 +321,59 @@ static int cache_block_group(struct btrfs_root *root, break; if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { - add_new_free_space(block_group, root->fs_info, last, - key.objectid); - + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); last = key.objectid + key.offset; } + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&block_group->caching_q); + } next: path->slots[0]++; } + ret = 0; - add_new_free_space(block_group, root->fs_info, last, - block_group->key.objectid + - block_group->key.offset); + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); - block_group->cached = 1; - remove_sb_from_cache(root, block_group); - ret = 0; err: btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + atomic_dec(&block_group->space_info->caching_threads); + wake_up(&block_group->caching_q); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ + struct task_struct *tsk; + int ret = 0; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return ret; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + return ret; } @@ -2387,13 +2477,29 @@ fail: } +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + atomic_inc(&cache->count); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_group_cache *cache, *entry; - struct rb_node *n; + struct btrfs_block_group_cache *cache; int err = 0; - int werr = 0; struct btrfs_path *path; u64 last = 0; @@ -2402,39 +2508,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; while (1) { - cache = NULL; - spin_lock(&root->fs_info->block_group_cache_lock); - for (n = rb_first(&root->fs_info->block_group_cache_tree); - n; n = rb_next(n)) { - entry = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - if (entry->dirty) { - cache = entry; - break; - } + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); } - spin_unlock(&root->fs_info->block_group_cache_lock); - if (!cache) - break; + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } cache->dirty = 0; - last += cache->key.offset; + last = cache->key.objectid + cache->key.offset; - err = write_one_cache_group(trans, root, - path, cache); - /* - * if we fail to write the cache group, we want - * to keep it marked dirty in hopes that a later - * write will work - */ - if (err) { - werr = err; - continue; - } + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); } + btrfs_free_path(path); - return werr; + return 0; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2484,6 +2586,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->force_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); return 0; } @@ -2947,13 +3050,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - if (pin) { + if (pin) set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); - } else { - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - } while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2969,14 +3068,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); fs_info->total_pinned += len; } else { + int unpin = 0; + + /* + * in order to not race with the block group caching, we + * only want to unpin the extent if we are cached. If + * we aren't cached, we want to start async caching this + * block group so we can free the extent the next time + * around. + */ spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + unpin = (cache->cached == BTRFS_CACHE_FINISHED); + if (likely(unpin)) { + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + fs_info->total_pinned -= len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - fs_info->total_pinned -= len; - if (cache->cached) + + if (likely(unpin)) + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + len -1, + GFP_NOFS); + else + cache_block_group(cache); + + if (unpin) btrfs_add_free_space(cache, bytenr, len); } btrfs_put_block_group(cache); @@ -3030,6 +3149,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) &start, &end, EXTENT_DIRTY); if (ret) break; + set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } @@ -3058,6 +3178,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + return ret; } @@ -3436,6 +3557,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) } /* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); + + if (block_group_cache_done(cache)) { + finish_wait(&cache->caching_q, &wait); + return 0; + } + schedule(); + finish_wait(&cache->caching_q, &wait); + + wait_event(cache->caching_q, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); + return 0; +} + +enum btrfs_loop_type { + LOOP_CACHED_ONLY = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == block start @@ -3460,6 +3620,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + bool found_uncached_bg = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3491,15 +3652,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (!last_ptr) { + if (!last_ptr) empty_cluster = 0; - loop = 1; - } if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (block_group && block_group_bits(block_group, data)) { + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + */ + if (block_group && block_group_bits(block_group, data) && + block_group_cache_done(block_group)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -3522,21 +3686,35 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { u64 offset; + int cached; atomic_inc(&block_group->count); search_start = block_group->key.objectid; have_block_group: - if (unlikely(!block_group->cached)) { - mutex_lock(&block_group->cache_mutex); - ret = cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); - if (ret) { - btrfs_put_block_group(block_group); - break; + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + /* + * we want to start caching kthreads, but not too many + * right off the bat so we don't overwhelm the system, + * so only start them if there are less than 2 and we're + * in the initial allocation phase. + */ + if (loop > LOOP_CACHING_NOWAIT || + atomic_read(&space_info->caching_threads) < 2) { + ret = cache_block_group(block_group); + BUG_ON(ret); } } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { + found_uncached_bg = true; + + /* if we only want cached bgs, loop */ + if (loop == LOOP_CACHED_ONLY) + goto loop; + } + if (unlikely(block_group->ro)) goto loop; @@ -3615,14 +3793,21 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } + } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + spin_unlock(&last_ptr->refill_lock); + + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; } + /* * at this point we either didn't find a cluster * or we weren't able to allocate a block from our * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < 2) { + if (loop < LOOP_NO_EMPTY_SIZE) { btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); @@ -3633,11 +3818,17 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset) + if (!offset && (cached || (!cached && + loop == LOOP_CACHING_NOWAIT))) { goto loop; + } else if (!offset && (!cached && + loop > LOOP_CACHING_NOWAIT)) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + goto have_block_group; + } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ if (search_start + num_bytes >= search_end) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -3683,13 +3874,26 @@ loop: } up_read(&space_info->groups_sem); - /* loop == 0, try to find a clustered alloc in every block group - * loop == 1, try again after forcing a chunk allocation - * loop == 2, set empty_size and empty_cluster to 0 and try again + /* LOOP_CACHED_ONLY, only search fully cached block groups + * LOOP_CACHING_NOWAIT, search partially cached block groups, but + * dont wait foR them to finish caching + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again */ - if (!ins->objectid && loop < 3 && - (empty_size || empty_cluster || allowed_chunk_alloc)) { - if (loop >= 2) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && + (found_uncached_bg || empty_size || empty_cluster || + allowed_chunk_alloc)) { + if (found_uncached_bg) { + found_uncached_bg = false; + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; + } + } + + if (loop == LOOP_ALLOC_CHUNK) { empty_size = 0; empty_cluster = 0; } @@ -3702,7 +3906,7 @@ loop: space_info->force_alloc = 1; } - if (loop < 3) { + if (loop < LOOP_NO_EMPTY_SIZE) { loop++; goto search; } @@ -3798,7 +4002,7 @@ again: num_bytes, data, 1); goto again; } - if (ret) { + if (ret == -ENOSPC) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -3806,7 +4010,6 @@ again: "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); dump_space_info(sinfo, num_bytes); - BUG(); } return ret; @@ -3844,7 +4047,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); - update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (!ret) + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; } @@ -4006,9 +4211,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - mutex_lock(&block_group->cache_mutex); - cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); + cache_block_group(block_group); + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); @@ -4039,7 +4244,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, empty_size, hint_byte, search_end, ins, 0); - BUG_ON(ret); + if (ret) + return ret; if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6955,11 +7161,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) &info->block_group_cache_tree); spin_unlock(&info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + WARN_ON(atomic_read(&block_group->count) != 1); kfree(block_group); @@ -7025,9 +7236,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + cache->fs_info = info; + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -7036,6 +7257,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + remove_sb_from_cache(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->cached = BTRFS_CACHE_FINISHED; + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + } ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), @@ -7079,10 +7320,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.objectid = chunk_offset; cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + + /* + * we only want to have 32k of ram per block group for keeping track + * of free space, and if we pass 1/2 of that we want to start + * converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7091,6 +7341,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->cached = BTRFS_CACHE_FINISHED; + remove_sb_from_cache(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); @@ -7149,7 +7405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); spin_unlock(&root->fs_info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they @@ -7158,11 +7414,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; spin_unlock(&block_group->space_info->lock); - block_group->space_info->full = 0; + + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a..af99b78b288 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,45 +16,46 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/math64.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; -}; +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node) +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, + u64 offset) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_free_space *info; + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div64_u64(offset, sectorsize)); +} - while (*p) { - parent = *p; - info = rb_entry(parent, struct btrfs_free_space, offset_index); +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ + return (unsigned long)(div64_u64(bytes, sectorsize)); +} - if (offset < info->offset) - p = &(*p)->rb_left; - else if (offset > info->offset) - p = &(*p)->rb_right; - else - return -EEXIST; - } +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; - rb_link_node(node, parent, p); - rb_insert_color(node, root); + bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; + bitmap_start = offset - block_group->key.objectid; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += block_group->key.objectid; - return 0; + return bitmap_start; } -static int tree_insert_bytes(struct rb_root *root, u64 bytes, - struct rb_node *node) +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, while (*p) { parent = *p; - info = rb_entry(parent, struct btrfs_free_space, bytes_index); + info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (bytes < info->bytes) + if (offset < info->offset) { p = &(*p)->rb_left; - else + } else if (offset > info->offset) { p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + WARN_ON(info->bitmap); + p = &(*p)->rb_right; + } else { + WARN_ON(!info->bitmap); + p = &(*p)->rb_left; + } + } } rb_link_node(node, parent, p); @@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, /* * searches the tree for the given offset. * - * fuzzy == 1: this is used for allocations where we are given a hint of where - * to look for free space. Because the hint may not be completely on an offset - * mark, or the hint may no longer point to free space we need to fudge our - * results a bit. So we look for free space starting at or after offset with at - * least bytes size. We prefer to find as close to the given offset as we can. - * Also if the offset is within a free space range, then we will return the free - * space that contains the given offset, which means we can return a free space - * chunk with an offset before the provided offset. - * - * fuzzy == 0: this is just a normal tree search. Give us the free space that - * starts at the given offset which is at least bytes size, and if its not there - * return NULL. + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. */ -static struct btrfs_free_space *tree_search_offset(struct rb_root *root, - u64 offset, u64 bytes, - int fuzzy) +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, + u64 offset, int bitmap_only, int fuzzy) { - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + struct rb_node *n = block_group->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } - while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; - if (offset < entry->offset) { - if (fuzzy && - (!ret || entry->offset < ret->offset) && - (bytes <= entry->bytes)) - ret = entry; + if (offset < entry->offset) n = n->rb_left; - } else if (offset > entry->offset) { - if (fuzzy && - (entry->offset + entry->bytes - 1) >= offset && - bytes <= entry->bytes) { - ret = entry; - break; - } + else if (offset > entry->offset) n = n->rb_right; - } else { - if (bytes > entry->bytes) { - n = n->rb_right; - continue; - } - ret = entry; + else break; - } } - return ret; -} - -/* - * return a chunk at least bytes size, as close to offset that we can get. - */ -static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, - u64 offset, u64 bytes) -{ - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; - while (n) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; - if (bytes < entry->bytes) { + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { /* - * We prefer to get a hole size as close to the size we - * are asking for so we don't take small slivers out of - * huge holes, but we also want to get as close to the - * offset as possible so we don't have a whole lot of - * fragmentation. + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry */ - if (offset <= entry->offset) { - if (!ret) - ret = entry; - else if (entry->bytes < ret->bytes) - ret = entry; - else if (entry->offset < ret->offset) - ret = entry; + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } } - n = n->rb_left; - } else if (bytes > entry->bytes) { - n = n->rb_right; + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); } else { - /* - * Ok we may have multiple chunks of the wanted size, - * so we don't want to take the first one we find, we - * want to take the one closest to our given offset, so - * keep searching just in case theres a better match. - */ - n = n->rb_right; - if (offset > entry->offset) - continue; - else if (!ret || entry->offset < ret->offset) - ret = entry; + if (fuzzy) + return entry; + else + return NULL; } } - return ret; + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; } static void unlink_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); - rb_erase(&info->bytes_index, &block_group->free_space_bytes); + block_group->free_extents--; + block_group->free_space -= info->bytes; } static int link_free_space(struct btrfs_block_group_cache *block_group, @@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, { int ret = 0; - - BUG_ON(!info->bytes); + BUG_ON(!info->bitmap && !info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, - &info->offset_index); + &info->offset_index, (info->bitmap != NULL)); if (ret) return ret; - ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, - &info->bytes_index); - if (ret) - return ret; + block_group->free_space += info->bytes; + block_group->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ + u64 max_bytes, possible_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + + possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + + (sizeof(struct btrfs_free_space) * + block_group->extents_thresh); + + if (possible_bytes > max_bytes) { + int extent_bytes = max_bytes - + (block_group->total_bitmaps * PAGE_CACHE_SIZE); + + if (extent_bytes <= 0) { + block_group->extents_thresh = 0; + return; + } + + block_group->extents_thresh = extent_bytes / + (sizeof(struct btrfs_free_space)); + } +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + clear_bit(i, info->bitmap); + + info->bytes -= bytes; + block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + set_bit(i, info->bitmap); + + info->bytes += bytes; + block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, block_group->sectorsize); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * block_group->sectorsize) + + bitmap_info->offset; + *bytes = (u64)(found_bits) * block_group->sectorsize; + return 0; + } + + return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache + *block_group, u64 *offset, + u64 *bytes, int debug) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!block_group->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, *offset), + 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset) +{ + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = (int)div64_u64(block_group->key.offset + + bytes_per_bg - 1, bytes_per_bg); + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); + link_free_space(block_group, info); + block_group->total_bitmaps++; + + recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + +again: + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, + end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, + *offset), + 1, 0); + if (!bitmap_info) + return -EINVAL; + + if (!bitmap_info->bitmap) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + int added = 0; + u64 bytes, offset, end; + int ret; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (block_group->free_extents < block_group->extents_thresh && + info->bytes > block_group->sectorsize * 4) + return 0; + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return 0; + + bytes = info->bytes; + offset = info->offset; + +again: + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize); + + if (offset >= bitmap_info->offset && offset + bytes > end) { + bitmap_set_bits(block_group, bitmap_info, offset, + end - offset); + bytes -= end - offset; + offset = end; + added = 0; + } else if (offset >= bitmap_info->offset && offset + bytes <= end) { + bitmap_set_bits(block_group, bitmap_info, offset, bytes); + bytes = 0; + } else { + BUG(); + } + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(block_group, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&block_group->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kzalloc(sizeof(struct btrfs_free_space), + GFP_NOFS); + if (!info) { + spin_lock(&block_group->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&block_group->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + } return ret; } @@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { - struct btrfs_free_space *right_info; - struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info = NULL; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *info = NULL; int ret = 0; @@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, * are adding, if there is remove that struct and add a new one to * cover the entire range */ - right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 0); - left_info = tree_search_offset(&block_group->free_space_offset, - offset-1, 0, 1); + right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(block_group, offset - 1, 0, 0); - if (right_info) { + /* + * If there was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + if ((!left_info || left_info->bitmap) && + (!right_info || right_info->bitmap)) { + ret = insert_into_bitmap(block_group, info); + + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } + } + + if (right_info && !right_info->bitmap) { unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); } - if (left_info && left_info->offset + left_info->bytes == offset) { + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; @@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, ret = link_free_space(block_group, info); if (ret) kfree(info); - +out: spin_unlock(&block_group->tree_lock); if (ret) { - printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); BUG_ON(ret == -EEXIST); } @@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; int ret = 0; spin_lock(&block_group->tree_lock); - info = tree_search_offset(&block_group->free_space_offset, offset, 0, - 1); - if (info && info->offset == offset) { - if (info->bytes < bytes) { - printk(KERN_ERR "Found free space at %llu, size %llu," - "trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); +again: + info = tree_search_offset(block_group, offset, 0, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + BITS_PER_BITMAP * + block_group->sectorsize - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; - spin_unlock(&block_group->tree_lock); - goto out; + goto out_lock; } - unlink_free_space(block_group, info); - if (info->bytes == bytes) { - kfree(info); - spin_unlock(&block_group->tree_lock); - goto out; + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(block_group, info); + if (info->bitmap) { + kfree(info->bitmap); + block_group->total_bitmaps--; } + kfree(info); + goto out_lock; + } + if (!info->bitmap && info->offset == offset) { + unlink_free_space(block_group, info); info->offset += bytes; info->bytes -= bytes; + link_free_space(block_group, info); + goto out_lock; + } - ret = link_free_space(block_group, info); - spin_unlock(&block_group->tree_lock); - BUG_ON(ret); - } else if (info && info->offset < offset && - info->offset + info->bytes >= offset + bytes) { + if (!info->bitmap && info->offset <= offset && + info->offset + info->bytes >= offset + bytes) { u64 old_start = info->offset; /* * we're freeing space in the middle of the info, @@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->offset = offset + bytes; info->bytes = old_end - info->offset; ret = link_free_space(block_group, info); - BUG_ON(ret); + WARN_ON(ret); + if (ret) + goto out_lock; } else { /* the hole we're creating ends at the end * of the info struct, just free the info @@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, kfree(info); } spin_unlock(&block_group->tree_lock); - /* step two, insert a new info struct to cover anything - * before the hole + + /* step two, insert a new info struct to cover + * anything before the hole */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - BUG_ON(ret); - } else { - spin_unlock(&block_group->tree_lock); - if (!info) { - printk(KERN_ERR "couldn't find space %llu to free\n", - (unsigned long long)offset); - printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->key.offset); - btrfs_dump_free_space(block_group, bytes); - } else if (info) { - printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " - "but wanted offset=%llu bytes=%llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - } - WARN_ON(1); + WARN_ON(ret); + goto out; } + + ret = remove_from_bitmap(block_group, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&block_group->tree_lock); out: return ret; } @@ -361,10 +751,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", (unsigned long long)info->offset, - (unsigned long long)info->bytes); + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); } @@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; + bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; + bitmap = cluster->points_to_bitmap; + cluster->block_group = NULL; cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + cluster->points_to_bitmap = false; + + if (bitmap) + goto out; + node = rb_first(&cluster->root); - while(node) { + while (node) { entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - link_free_space(block_group, entry); + BUG_ON(entry->bitmap); + tree_insert_offset(&block_group->free_space_offset, + entry->offset, &entry->offset_index, 0); } - list_del_init(&cluster->block_group_list); - - btrfs_put_block_group(cluster->block_group); - cluster->block_group = NULL; cluster->root.rb_node = NULL; + out: spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); return 0; } @@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_free_space *info; struct rb_node *node; struct btrfs_free_cluster *cluster; - struct btrfs_free_cluster *safe; + struct list_head *head; spin_lock(&block_group->tree_lock); - - list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, - block_group_list) { + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } } - while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, bytes_index); + while ((node = rb_last(&block_group->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); unlink_free_space(block_group, info); + if (info->bitmap) + kfree(info->bitmap); kfree(info); if (need_resched()) { spin_unlock(&block_group->tree_lock); @@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) spin_lock(&block_group->tree_lock); } } + spin_unlock(&block_group->tree_lock); } @@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; u64 ret = 0; spin_lock(&block_group->tree_lock); - entry = tree_search_offset(&block_group->free_space_offset, offset, - bytes + empty_size, 1); + entry = find_free_space(block_group, &offset, &bytes_search, 0); if (!entry) - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes + empty_size); - if (entry) { + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(block_group, entry, offset, bytes); + if (!entry->bytes) { + unlink_free_space(block_group, entry); + kfree(entry->bitmap); + kfree(entry); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + } else { unlink_free_space(block_group, entry); - ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; - if (!entry->bytes) kfree(entry); else link_free_space(block_group, entry); } + +out: spin_unlock(&block_group->tree_lock); return ret; @@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space( return ret; } +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space *entry; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + if (!cluster->points_to_bitmap) + goto out; + + if (cluster->block_group != block_group) + goto out; + + entry = tree_search_offset(block_group, search_start, 0, 0); + + if (!entry || !entry->bitmap) + goto out; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(block_group, entry, &search_start, + &search_bytes); + if (err) + goto out; + + ret = search_start; + bitmap_clear_bits(block_group, entry, ret, bytes); +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset @@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; + if (cluster->points_to_bitmap) + return btrfs_alloc_from_bitmap(block_group, cluster, bytes, + min_start); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, } out: spin_unlock(&cluster->lock); + return ret; } +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -1; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + cluster->points_to_bitmap = true; + + return 0; +} + /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes free and up to empty_size + bytes free. @@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry = NULL; struct rb_node *node; struct btrfs_free_space *next; - struct btrfs_free_space *last; + struct btrfs_free_space *last = NULL; u64 min_bytes; u64 window_start; u64 window_free; u64 max_extent = 0; - int total_retries = 0; + bool found_bitmap = false; int ret; /* for metadata, allow allocates with more holes */ @@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } again: - min_bytes = min(min_bytes, bytes + empty_size); - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, min_bytes); + entry = tree_search_offset(block_group, offset, found_bitmap, 1); if (!entry) { ret = -ENOSPC; goto out; } + + /* + * If found_bitmap is true, we exhausted our search for extent entries, + * and we just want to search all of the bitmaps that we can find, and + * ignore any extent entries we find. + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { + struct rb_node *node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, + offset, bytes + empty_size, + min_bytes); + if (!ret) + goto got_it; + } + + if (!node) { + ret = -ENOSPC; + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + /* + * We already searched all the extent entries from the passed in offset + * to the end and didn't find enough space for the cluster, and we also + * didn't find any bitmaps that met our criteria, just go ahead and exit + */ + if (found_bitmap) { + ret = -ENOSPC; + goto out; + } + + cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; - while(1) { + while (1) { /* out window is just right, lets fill it */ if (window_free >= bytes + empty_size) break; node = rb_next(&last->offset_index); if (!node) { + if (found_bitmap) + goto again; ret = -ENOSPC; goto out; } next = rb_entry(node, struct btrfs_free_space, offset_index); /* + * we found a bitmap, so if this search doesn't result in a + * cluster, we know to go and search again for the bitmaps and + * start looking for space there + */ + if (next->bitmap) { + if (!found_bitmap) + offset = next->offset; + found_bitmap = true; + last = next; + continue; + } + + /* * we haven't filled the empty size and the window is * very large. reset and try again */ @@ -655,19 +1234,6 @@ again: window_free = entry->bytes; last = entry; max_extent = 0; - total_retries++; - if (total_retries % 64 == 0) { - if (min_bytes >= (bytes + empty_size)) { - ret = -ENOSPC; - goto out; - } - /* - * grow our allocation a bit, we're not having - * much luck - */ - min_bytes *= 2; - goto again; - } } else { last = next; window_free += next->bytes; @@ -685,11 +1251,19 @@ again: * The cluster includes an rbtree, but only uses the offset index * of each free space cache entry. */ - while(1) { + while (1) { node = rb_next(&entry->offset_index); - unlink_free_space(block_group, entry); + if (entry->bitmap && node) { + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } else if (entry->bitmap && !node) { + break; + } + + rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index); + &entry->offset_index, 0); BUG_ON(ret); if (!node || entry == last) @@ -697,8 +1271,10 @@ again: entry = rb_entry(node, struct btrfs_free_space, offset_index); } - ret = 0; + cluster->max_size = max_extent; +got_it: + ret = 0; atomic_inc(&block_group->count); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root.rb_node = NULL; cluster->max_size = 0; + cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb876405..890a8e79011 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -19,6 +19,14 @@ #ifndef __BTRFS_FREE_SPACE_CACHE #define __BTRFS_FREE_SPACE_CACHE +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 791eab19e33..56fe83fa60c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); - path->reada = -1; BUG_ON(!path); + path->reada = -1; /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a3..0d126be22b6 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) } printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", (unsigned long long)btrfs_header_bytenr(c), - btrfs_header_level(c), nr, + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); @@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && - btrfs_header_level(c) != 1) + level != 1) BUG(); if (btrfs_header_level(next) != - btrfs_header_level(c) - 1) + level - 1) BUG(); btrfs_print_tree(root, next); free_extent_buffer(next); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 00839793477..e71264d1c2c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -670,6 +670,8 @@ again: err = ret; goto out; } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; eb = path2->nodes[level]; WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != @@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; if (ret < 0) { btrfs_free_path(path); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56e..cdbb5022da5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) @@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_key, &root->root_item); BUG_ON(ret); - btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + return 0; } @@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + return 0; } @@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (root->commit_root != root->node) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + switch_commit_root(root); btrfs_set_root_node(&root->root_item, root->node); } @@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root) super->root_level = root_item->level; } +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit || snap_pending) { - if (flush_on_commit) - btrfs_start_delalloc_inodes(root); + if (flush_on_commit) { + btrfs_start_delalloc_inodes(root); + ret = btrfs_wait_ordered_extents(root, 0); + BUG_ON(ret); + } else if (snap_pending) { ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } @@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); + switch_commit_root(root->fs_info->tree_root); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); + switch_commit_root(root->fs_info->chunk_root); update_super_roots(root); @@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 961c3ee5a2e..663c6740491 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1..d91b0de7c50 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, return -ENOENT; inode = read_one_inode(root, key->objectid); - BUG_ON(!dir); + BUG_ON(!inode); ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd76..5dbefd11b4a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -721,7 +721,8 @@ error: */ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 num_bytes, u64 *start) + u64 num_bytes, u64 *start, + u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto error; - ret = btrfs_previous_item(root, path, 0, key.type); - if (ret < 0) - goto error; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto error; + if (ret > 0) + start_found = 1; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); while (1) { @@ -803,6 +808,10 @@ no_more_items: if (last_byte < search_start) last_byte = search_start; hole_size = key.offset - last_byte; + + if (hole_size > *max_avail) + *max_avail = hole_size; + if (key.offset > last_byte && hole_size >= num_bytes) { *start = last_byte; @@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); return btrfs_update_device(trans, device); @@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; - goto done; + break; } l = path->nodes[0]; @@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) - goto done; + break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); @@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size); again: + max_avail = 0; if (!map || map->num_stripes != num_stripes) { kfree(map); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2219,7 +2230,8 @@ again: if (device->in_fs_metadata && avail >= min_free) { ret = find_free_dev_extent(trans, device, - min_free, &dev_offset); + min_free, &dev_offset, + &max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); @@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, } } - for (i = 0; i > nr; i++) { - struct btrfs_multi_bio *multi; - struct btrfs_bio_stripe *stripe; - int ret; - - length = 1; - ret = btrfs_map_block(map_tree, WRITE, buf[i], - &length, &multi, 0); - BUG_ON(ret); - - stripe = multi->stripes; - for (j = 0; j < multi->num_stripes; j++) { - if (stripe->physical >= physical && - physical < stripe->physical + length) - break; - } - BUG_ON(j >= multi->num_stripes); - kfree(multi); - } - *logical = buf; *naddrs = nr; *stripe_len = map->stripe_len; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7f19fefd3d4..42cec2a7c0c 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&tcon->num_reads, 0); atomic_set(&tcon->num_oplock_brks, 0); atomic_set(&tcon->num_opens, 0); + atomic_set(&tcon->num_posixopens, 0); + atomic_set(&tcon->num_posixmkdirs, 0); atomic_set(&tcon->num_closes, 0); atomic_set(&tcon->num_deletes, 0); atomic_set(&tcon->num_mkdirs, 0); @@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&tcon->num_locks), atomic_read(&tcon->num_hardlinks), atomic_read(&tcon->num_symlinks)); - seq_printf(m, "\nOpens: %d Closes: %d" + seq_printf(m, "\nOpens: %d Closes: %d " "Deletes: %d", atomic_read(&tcon->num_opens), atomic_read(&tcon->num_closes), atomic_read(&tcon->num_deletes)); + seq_printf(m, "\nPosix Opens: %d " + "Posix Mkdirs: %d", + atomic_read(&tcon->num_posixopens), + atomic_read(&tcon->num_posixmkdirs)); seq_printf(m, "\nMkdirs: %d Rmdirs: %d", atomic_read(&tcon->num_mkdirs), atomic_read(&tcon->num_rmdirs)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 63f6cdfa563..6084d6379c0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -260,6 +260,8 @@ struct cifsTconInfo { atomic_t num_closes; atomic_t num_deletes; atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; atomic_t num_rmdirs; atomic_t num_renames; atomic_t num_t2renames; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 922f5fe2084..1866bc2927d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1113,7 +1113,10 @@ PsxCreat: psx_create_err: cifs_buf_release(pSMB); - cifs_stats_inc(&tcon->num_mkdirs); + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->num_posixmkdirs); + else + cifs_stats_inc(&tcon->num_posixopens); if (rc == -EAGAIN) goto PsxCreat; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e16d7592116..fc44d316d0b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2452,10 +2452,10 @@ try_mount_again: tcon->local_lease = volume_info->local_lease; } if (pSesInfo) { - if (pSesInfo->capabilities & CAP_LARGE_FILES) { - sb->s_maxbytes = (u64) 1 << 63; - } else - sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ + if (pSesInfo->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; } /* BB FIXME fix time_gran to be larger for LANMAN sessions */ @@ -2726,6 +2726,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, strncpy(tcon->treeName, tree, MAX_TREE_SIZE); /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, bytes_left, is_unicode, nls_codepage); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 18afe57b246..82d83839655 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * junction to the new submount (ie to setup the fake directory * which represents a DFS referral). */ -void +static void cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -void +static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct cifs_sb_info *cifs_sb, bool adjust_tz) { @@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { - /* BB EOPNOSUPP disable SERVER_INUM? */ cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); + /* disable serverino if call not supported */ + if (rc1 == -EINVAL) + cifs_sb->mnt_cifs_flags &= + ~CIFS_MOUNT_SERVER_INUM; } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index af737bb56cb..259525c9abb 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, } (*new_auth_tok)->session_key.encrypted_key_size = (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } if (unlikely(data[(*packet_size)++] != 0x04)) { printk(KERN_WARNING "Unknown version number [%d]\n", data[(*packet_size) - 1]); @@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } if (data[(*packet_size)++] != 0x62) { printk(KERN_WARNING "Unrecognizable packet\n"); rc = -EINVAL; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882..373fa90c796 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c75..b49908a167a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -788,7 +788,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1193,15 +1186,16 @@ write_begin_failed: * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before truncate - * finishes. + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) @@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { @@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 03ebb439ace..7ebae9a4ecc 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; @@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) @@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -712,6 +723,10 @@ out_alloc_put: gfs2_alloc_put(ip); } out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); out_uninit: gfs2_holder_uninit(&ip->i_gh); @@ -725,14 +740,21 @@ out_uninit: static void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; u64 fs_total, new_free; /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); if (fs_total > (m_sc->sc_total + l_sc->sc_total)) new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); else @@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode) fs_warn(sdp, "File system extended by %llu blocks.\n", (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); } /** @@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); @@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, brelse(dibh); gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return copied; @@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); @@ -865,6 +900,10 @@ failed: gfs2_quota_unlock(ip); gfs2_alloc_put(ip); } + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return ret; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 297421c0427..8b674b1f3a5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); @@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); atomic_inc(&gl->gl_ref); } /** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + +/** * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock * @@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { + int may_reclaim; + may_reclaim = (demote_ok(gl) && + (atomic_read(&gl->gl_ref) == 1 || + (gl->gl_name.ln_type == LM_TYPE_INODE && + atomic_read(&gl->gl_ref) <= 2))); spin_lock(&lru_lock); - if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { + if (list_empty(&gl->gl_lru) && may_reclaim) { list_add_tail(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); } @@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) } /** + * gfs2_glock_put_nolock() - Decrement reference count on glock + * @gl: The glock to put + * + * This function should only be used if the caller has its own reference + * to the glock, in addition to the one it is dropping. + */ + +void gfs2_glock_put_nolock(struct gfs2_glock *gl) +{ + if (atomic_dec_and_test(&gl->gl_ref)) + GLOCK_BUG_ON(gl, 1); + gfs2_glock_schedule_for_reclaim(gl); +} + +/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl) rv = 1; goto out; } - /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ - if (atomic_read(&gl->gl_ref) == 2) - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); write_unlock(gl_lock_addr(gl->gl_hash)); out: return rv; @@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) if (held2) gfs2_glock_hold(gl); else - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); } gl->gl_state = new_state; @@ -633,12 +674,35 @@ out: out_sched: gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); goto out; } +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = NULL; + struct inode *inode; + u64 no_addr = 0; + + spin_lock(&gl->gl_spin); + ip = (struct gfs2_inode *)gl->gl_object; + if (ip) + no_addr = ip->i_no_addr; + spin_unlock(&gl->gl_spin); + if (ip) { + inode = gfs2_ilookup(sdp->sd_vfs, no_addr); + if (inode) { + d_prune_aliases(inode); + iput(inode); + } + } + gfs2_glock_put(gl); +} + static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; @@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_aspace = NULL; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); /* If this glock protects actual on-disk data or metadata blocks, create a VFS inode to manage the pages/buffers holding them. */ @@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl); trace_gfs2_demote_rq(gl); } @@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } -/** - * demote_ok - Check to see if it's ok to unlock a glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int demote_ok(const struct gfs2_glock *gl) -{ - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (gl->gl_state == LM_ST_UNLOCKED) - return 0; - if (!list_empty(&gl->gl_holders)) - return 0; - if (glops->go_demote_ok) - return glops->go_demote_ok(gl); - return 1; -} - static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; int nr_skipped = 0; - int got_ref = 0; LIST_HEAD(skipped); if (nr == 0) @@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) list_del_init(&gl->gl_lru); atomic_dec(&lru_count); + /* Check if glock is about to be freed */ + if (atomic_read(&gl->gl_ref) == 0) + continue; + /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { gfs2_glock_hold(gl); - got_ref = 1; spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); may_demote = demote_ok(gl); - spin_unlock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); if (may_demote) { handle_callback(gl, LM_ST_UNLOCKED, 0); nr--; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - got_ref = 0; } + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); spin_lock(&lru_lock); - if (may_demote) - continue; - } - if (list_empty(&gl->gl_lru) && - (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - } - if (got_ref) { - spin_unlock(&lru_lock); - gfs2_glock_put(gl); - spin_lock(&lru_lock); - got_ref = 0; + continue; } + nr_skipped++; + list_add(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); atomic_add(nr_skipped, &lru_count); @@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void) glock_workqueue = create_workqueue("glock_workqueue"); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); + gfs2_delete_workqueue = create_workqueue("delete_workqueue"); + if (IS_ERR(gfs2_delete_workqueue)) { + destroy_workqueue(glock_workqueue); + return PTR_ERR(gfs2_delete_workqueue); + } register_shrinker(&glock_shrinker); @@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void) { unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); } static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index a602a28f6f0..c609894ec0d 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -143,6 +143,7 @@ struct lm_lockops { #define GLR_TRYFAILED 13 +extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; @@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put_nolock(struct gfs2_glock *gl); int gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d5e4ab155ca..6985eef06c3 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } @@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) return 0; } +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && + ip && test_bit(GIF_USER, &ip->i_flags)) { + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put_nolock(gl); + } +} + const struct gfs2_glock_operations gfs2_meta_glops = { .go_type = LM_TYPE_META, }; @@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, }; const struct gfs2_glock_operations gfs2_flock_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 225347fbff3..61801ada36f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -159,6 +159,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback) (struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; }; @@ -228,6 +229,7 @@ struct gfs2_glock { struct list_head gl_ail_list; atomic_t gl_ail_count; struct delayed_work gl_work; + struct work_struct gl_delete; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index daa4ae341a2..fba795798d3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; - if (count[1] + count[2] != tmp) { + if (count[1] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", count[1], tmp); return; } - if (count[3] != rgd->rd_dinodes) { + if (count[2] + count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_dinodes); + count[2] + count[3], rgd->rd_dinodes); return; } - - if (count[2] > count[3]) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "unlinked inodes > inodes: %u\n", - count[2]); - return; - } - } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -961,7 +953,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * Returns: The inode, if one has been found */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) +static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { struct inode *inode; u32 goal = 0, block; @@ -985,6 +978,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) goal++; if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) continue; + if (no_addr == skip) + continue; *last_unlinked = no_addr; inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, no_addr, -1, 1); @@ -1104,7 +1099,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) @@ -1138,7 +1133,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0a680133647..f522bb01797 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -353,7 +353,7 @@ fail: return error; } -static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; @@ -441,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, brelse(l_bh); } +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + int gfs2_statfs_sync(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -477,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) if (error) goto out_bh2; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - - spin_lock(&sdp->sd_statfs_spin); - m_sc->sc_total += l_sc->sc_total; - m_sc->sc_free += l_sc->sc_free; - m_sc->sc_dinodes += l_sc->sc_dinodes; - memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), - 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + update_statfs(sdp, m_bh, l_bh); gfs2_trans_end(sdp); @@ -680,6 +691,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp); gfs2_statfs_sync(sdp); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b56413e3e40..22e0417ed99 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b..f96f85092d1 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -361,14 +367,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +383,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b..c03ac11f74b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -583,14 +564,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c..a29c7c3e3fb 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(value, size); } kfree(value); - if (!IS_ERR(acl)) { + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); - posix_acl_release(acl); - } return acl; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c2d061675d8..8d25ccb2d51 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1242,20 +1242,6 @@ error: return error; } -/* - * Initialize a session. - * Note: save the mount rsize and wsize for create_server negotiation. - */ -static void nfs4_init_session(struct nfs_client *clp, - unsigned int wsize, unsigned int rsize) -{ -#if defined(CONFIG_NFS_V4_1) - if (nfs4_has_session(clp)) { - clp->cl_session->fc_attrs.max_rqst_sz = wsize; - clp->cl_session->fc_attrs.max_resp_sz = rsize; - } -#endif /* CONFIG_NFS_V4_1 */ -} /* * Session has been established, and the client marked ready. @@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - nfs4_init_session(server->nfs_client, server->wsize, server->rsize); + error = nfs4_init_session(server); + if (error < 0) + goto error; /* Probe the root fh to retrieve its FSID */ error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 38d42c29fb9..32062c33c85 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 61bc3a32e1e..6ea07a3c75d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *, int reset); extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_init_session(struct nfs_server *server); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp, { return 0; } + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff0c080db59..6917311f201 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2040,15 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - int status; nfs_fattr_init(info->fattr); - status = nfs4_recover_expired_lease(server); - if (!status) - status = nfs4_check_client_ready(server->nfs_client); - if (!status) - status = nfs4_call_sync(server, &msg, &args, &res, 0); - return status; + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -4099,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (request->fl_start < 0 || request->fl_end < 0) return -EINVAL; - if (IS_GETLK(cmd)) - return nfs4_proc_getlk(state, F_GETLK, request); + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) - return nfs4_proc_unlck(state, cmd, request); + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + if (state == NULL) + return -ENOLCK; do { status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) @@ -4793,6 +4795,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) return status; } +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + + clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; + clp->cl_session->fc_attrs.max_resp_sz = server->rsize; + ret = nfs4_recover_expired_lease(server); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +} + /* * Renew the cl_session lease. */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b73c5a72865..65ca8c18476 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f INIT_LIST_HEAD(&lsp->ls_sequence.list); lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; lsp->ls_owner = fl_owner; spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); @@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ if (lsp != NULL) break; if (new != NULL) { - new->ls_state = state; list_add(&new->ls_locks, &state->lock_states); set_bit(LK_STATE_IN_USE, &state->flags); lsp = new; diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig new file mode 100644 index 00000000000..72da095d400 --- /dev/null +++ b/fs/nilfs2/Kconfig @@ -0,0 +1,25 @@ +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f..dffbb0911d0 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,15 +1,5 @@ config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. + def_bool n source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405..f9c1ca139d8 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,6 +1,6 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY + select FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd7681..037e878e03f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a..3e56dbffe72 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,7 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + select FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a296584..f30d9bbc2e1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -365,6 +364,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; } +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark_entry *ientry) +{ + struct idr *idr; + + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + idr_remove(idr, ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + ientry->wd = -1; +} /* * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the * internal reference help on the mark because it is in the idr. @@ -373,13 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct idr *idr; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -388,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); /* did the private data get added? */ if (list_empty(&fsn_event_priv->event_list)) @@ -396,14 +412,16 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + inotify_remove_from_idr(group, ientry); /* removed from idr, drop that reference */ fsnotify_put_mark(entry); + + atomic_dec(&group->inotify_data.user->inotify_watches); } /* ding dong the mark is dead */ @@ -418,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { struct fsnotify_mark_entry *entry = NULL; struct inotify_inode_mark_entry *ientry; + struct inotify_inode_mark_entry *tmp_ientry; int ret = 0; int add = (arg & IN_MASK_ADD); __u32 mask; @@ -428,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod if (unlikely(!mask)) return -EINVAL; - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) return -ENOMEM; /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->wd = -1; find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - - entry = &ientry->fsn_entry; retry: ret = -ENOMEM; if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) goto out_err; spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { if (ret == -EAGAIN) goto retry; goto out_err; } + + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + inotify_remove_from_idr(group, tmp_ientry); + if (ret == -EEXIST) + goto find_entry; + goto out_err; + } + + /* tmp_ientry has been added to the inode, so we are all set up. + * now we just need to make sure tmp_ientry doesn't get freed and + * we need to set up entry and ientry so the generic code can + * do its thing. */ + ientry = tmp_ientry; + entry = &ientry->fsn_entry; + tmp_ientry = NULL; + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* update the idr hint */ + group->inotify_data.last_wd = ientry->wd; + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(entry); } + ret = ientry->wd; + spin_lock(&entry->lock); old_mask = entry->mask; @@ -506,14 +537,19 @@ retry: fsnotify_recalc_group_mask(group); } - return ientry->wd; + /* this either matches fsnotify_find_mark_entry, or init_mark_entry + * depending on which path we took... */ + fsnotify_put_mark(entry); out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); + /* could be an error, could be that we found an existing mark */ + if (tmp_ientry) { + /* on the idr but didn't make it on the inode */ + if (tmp_ientry->wd != -1) + inotify_remove_from_idr(group, tmp_ientry); + kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); } + return ret; } @@ -721,9 +757,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756f..521368574e9 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new { if ((old->mask == new->mask) && (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { switch (old->data_type) { case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (old->name_len && + !strcmp(old->file_name, new->file_name)) return true; break; case (FSNOTIFY_EVENT_PATH): if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): - return true; + return false; }; } return false; @@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/fs/pipe.c b/fs/pipe.c index f7dd21ad85a..52c41511483 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 70f36c043d6..38f7bd559f3 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_lock; @@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); sb->dq_op->drop(inode); } @@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, goto out_file_init; } mutex_unlock(&dqopt->dqio_mutex); - mutex_unlock(&inode->i_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); @@ -2096,13 +2096,14 @@ out_file_init: out_lock: if (oldflags != -1) { down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); } - mutex_unlock(&inode->i_mutex); mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: put_quota_format(fmt); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ebb2c417912..11f0c06316d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -20,6 +20,7 @@ #include <linux/ramfs.h> #include <linux/pagevec.h> #include <linux/mman.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d88d0fac9fa..14f2d71ea3c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -939,8 +939,10 @@ again: /* Remove from old parent's list and insert into new parent's list. */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); + drop_nlink(old_parent->d_inode); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; + inc_nlink(new_parent->d_inode); sysfs_link_sibling(sd); out_unlock: diff --git a/fs/udf/super.c b/fs/udf/super.c index 6832135159b..9d1b8c2e6c4 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; /* VAT file entry is in the last recorded block */ ino.partitionReferenceNum = type1_index; ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; sbi->s_vat_inode = udf_iget(sb, &ino); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" + " last recorded block (%lu), retrying with the last " + "block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + ino.partitionReferenceNum = type1_index; + ino.logicalBlockNum = blocks - 1 - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } if (!sbi->s_vat_inode) return 1; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc05b2..aecf2519db7 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1268,6 +1268,14 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * VM calculation for nr_to_write seems off. Bump it way + * up, this gets simple streaming writes zippy again. + * To be reviewed again after Jens' writeback changes. + */ + wbc->nr_to_write *= 4; + /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 58973bb4603..8070b34cc28 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -680,8 +680,8 @@ xfs_vn_fiemap( else bm.bmv_length = BTOBB(length); - /* our formatter will tell xfs_getbmap when to stop. */ - bm.bmv_count = MAXEXTNUM; + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = fieinfo->fi_extents_max + 1; bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; |