From b765ead57da62cccf7fa21e00e6eed65e9df62b0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Apr 2009 10:27:10 -0400 Subject: Btrfs: keep processing bios for a given bdev if our proc is batching Btrfs uses async helper threads to submit write bios so the checksumming helper threads don't block on the disk. The submit bio threads may process bios for more than one block device, so when they find one device congested they try to move on to other devices instead of blocking in get_request_wait for one device. This does a pretty good job of keeping multiple devices busy, but the congested flag has a number of problems. A congested device may still give you a request, and other procs that aren't backing off the congested device may starve you out. This commit uses the io_context stored in current to decide if our process has been made a batching process by the block layer. If so, it keeps sending IO down for at least one batch. This helps make sure we do a good amount of work each time we visit a bdev, and avoids large IO stalls in multi-device workloads. It's also very ugly. A better solution is in the works with Jens Axboe. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd06e18e5aa..cc01abff03d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "compat.h" #include "ctree.h" @@ -145,6 +146,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run = 0; unsigned long limit; + unsigned long last_waited = 0; bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; fs_info = device->dev_root->fs_info; @@ -207,7 +209,32 @@ loop_lock: if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct bio *old_head; + struct io_context *ioc; + ioc = current->io_context; + + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + continue; + } spin_lock(&device->io_lock); old_head = device->pending_bios; -- cgit v1.2.3 From bedf762ba3a4b70295661fa70c29c1f18fe0f351 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Apr 2009 10:32:58 -0400 Subject: Btrfs: unplug in the async bio submission threads Btrfs pages being written get set to writeback, and then may go through a number of steps before they hit the block layer. This includes compression, checksumming and async bio submission. The end result is that someone who writes a page and then does wait_on_page_writeback is likely to unplug the queue before the bio they cared about got there. We could fix this by marking bios sync, or by doing more frequent unplugs, but this commit just changes the async bio submission code to unplug after it has processed all the bios for a device. 
The async bio submission does a fair job of collection bios, so this shouldn't be a huge problem for reducing merging at the elevator. For streaming O_DIRECT writes on a 5 drive array, it boosts performance from 386MB/s to 460MB/s. Thanks to Hisashi Hifumi for helping with this work. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc01abff03d..e0913e46972 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -148,7 +148,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -258,6 +258,18 @@ loop_lock: if (device->pending_bios) goto loop_lock; spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. + */ + blk_run_backing_dev(bdi, NULL); done: return 0; } -- cgit v1.2.3 From 70cb074345832b75cf422ed729706345511773b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 Apr 2009 10:14:19 -0400 Subject: Btrfs: free space cache cleanups This patch cleans up the free space cache code a bit. It better documents the idiosyncrasies of tree_search_offset and makes the code make a bit more sense. I took out the info allocation at the start of __btrfs_add_free_space and put it where it makes more sense. This was left over cruft from when alloc_mutex existed. Also all of the re-searches we do to make sure we inserted properly. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/free-space-cache.c | 93 +++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5e7cae63d8..2c214781fee 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -291,8 +291,8 @@ next: block_group->key.objectid + block_group->key.offset); - remove_sb_from_cache(root, block_group); block_group->cached = 1; + remove_sb_from_cache(root, block_group); ret = 0; err: btrfs_free_path(path); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d1e5f0e84c5..69b023ff6f7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -68,14 +68,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, } /* - * searches the tree for the given offset. If contains is set we will return - * the free space that contains the given offset. If contains is not set we - * will return the free space that starts at or after the given offset and is - * at least bytes long. + * searches the tree for the given offset. + * + * fuzzy == 1: this is used for allocations where we are given a hint of where + * to look for free space. Because the hint may not be completely on an offset + * mark, or the hint may no longer point to free space we need to fudge our + * results a bit. So we look for free space starting at or after offset with at + * least bytes size. 
We prefer to find as close to the given offset as we can. + * Also if the offset is within a free space range, then we will return the free + * space that contains the given offset, which means we can return a free space + * chunk with an offset before the provided offset. + * + * fuzzy == 0: this is just a normal tree search. Give us the free space that + * starts at the given offset which is at least bytes size, and if its not there + * return NULL. */ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, u64 offset, u64 bytes, - int contains) + int fuzzy) { struct rb_node *n = root->rb_node; struct btrfs_free_space *entry, *ret = NULL; @@ -84,13 +94,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, entry = rb_entry(n, struct btrfs_free_space, offset_index); if (offset < entry->offset) { - if (!contains && + if (fuzzy && (!ret || entry->offset < ret->offset) && (bytes <= entry->bytes)) ret = entry; n = n->rb_left; } else if (offset > entry->offset) { - if ((entry->offset + entry->bytes - 1) >= offset && + if (fuzzy && + (entry->offset + entry->bytes - 1) >= offset && bytes <= entry->bytes) { ret = entry; break; @@ -190,55 +201,28 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; - struct btrfs_free_space *alloc_info; int ret = 0; - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!alloc_info) - return -ENOMEM; - /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 1); + offset+bytes, 0, 0); left_info = tree_search_offset(&block_group->free_space_offset, offset-1, 0, 1); - if (right_info && right_info->offset == offset+bytes) { + if (right_info) { unlink_free_space(block_group, right_info); info = right_info; info->offset = offset; info->bytes += bytes; - } else if (right_info && right_info->offset != offset+bytes) { - printk(KERN_ERR "btrfs adding space in the middle of an " - "existing free space area. existing: " - "offset=%llu, bytes=%llu. new: offset=%llu, " - "bytes=%llu\n", (unsigned long long)right_info->offset, - (unsigned long long)right_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); } - if (left_info) { + if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - if (unlikely((left_info->offset + left_info->bytes) != - offset)) { - printk(KERN_ERR "btrfs free space to the left " - "of new free space isn't " - "quite right. existing: offset=%llu, " - "bytes=%llu. 
new: offset=%llu, bytes=%llu\n", - (unsigned long long)left_info->offset, - (unsigned long long)left_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); - } - if (info) { info->offset = left_info->offset; info->bytes += left_info->bytes; @@ -251,13 +235,15 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, if (info) { ret = link_free_space(block_group, info); - if (!ret) - info = NULL; + if (ret) + kfree(info); goto out; } - info = alloc_info; - alloc_info = NULL; + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + info->offset = offset; info->bytes = bytes; @@ -271,8 +257,6 @@ out: BUG(); } - kfree(alloc_info); - return ret; } @@ -283,6 +267,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info; int ret = 0; + BUG_ON(!block_group->cached); info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); @@ -341,6 +326,18 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, offset - old_start); BUG_ON(ret); } else { + if (!info) { + printk(KERN_ERR "couldn't find space %llu to free\n", + (unsigned long long)offset); + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", + block_group->cached, block_group->key.objectid, + block_group->key.offset); + btrfs_dump_free_space(block_group, bytes); + } else if (info) { + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " + "but wanted offset=%llu bytes=%llu\n", + info->offset, info->bytes, offset, bytes); + } WARN_ON(1); } out: @@ -351,12 +348,9 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { int ret; - struct btrfs_free_space *sp; mutex_lock(&block_group->alloc_mutex); ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); mutex_unlock(&block_group->alloc_mutex); return ret; @@ -366,11 +360,8 @@ int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { int ret; - struct btrfs_free_space *sp; ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); return ret; } @@ -408,6 +399,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; + printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, + info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -486,7 +479,7 @@ struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache struct btrfs_free_space *ret = NULL; ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); + bytes, 1); if (!ret) ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); -- cgit v1.2.3 From 2552d17e328044d1811cae733087a1fb9aac2eb6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 Apr 2009 10:14:19 -0400 Subject: Btrfs: clean up find_free_extent I've replaced the strange looping constructs with a list_for_each_entry on space_info->block_groups. If we have a hint we just jump into the loop with the block group and start looking for space. If we don't find anything we start at the beginning and start looking. 
We never come out of the loop with a ref on the block_group _unless_ we found space to use, then we drop it after we set the trans block_group. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 256 ++++++++++++++++++------------------------------- 1 file changed, 91 insertions(+), 165 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2c214781fee..daff751ea6e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2556,14 +2556,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; u64 total_needed = num_bytes; u64 *last_ptr = NULL; - u64 last_wanted = 0; struct btrfs_block_group_cache *block_group = NULL; - int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - struct list_head *head = NULL, *cur = NULL; - int loop = 0; - int extra_loop = 0; + int using_hint = 0; struct btrfs_space_info *space_info; WARN_ON(num_bytes < root->sectorsize); @@ -2571,6 +2567,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; + space_info = __find_space_info(root->fs_info, data); + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; @@ -2584,10 +2582,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, last_ptr = &root->fs_info->last_data_alloc; if (last_ptr) { - if (*last_ptr) { + if (*last_ptr) hint_byte = *last_ptr; - last_wanted = *last_ptr; - } else + else empty_size += empty_cluster; } else { empty_cluster = 0; @@ -2595,187 +2592,125 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (last_wanted && search_start != last_wanted) { - last_wanted = 0; + if (search_start == hint_byte) { + using_hint = 1; + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + if (block_group && block_group_bits(block_group, data)) { + total_needed += empty_size; + down_read(&space_info->groups_sem); + goto have_block_group; + } else if (block_group) { + put_block_group(block_group); + } + empty_size += empty_cluster; + using_hint = 0; } - total_needed += empty_size; - block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(root->fs_info, - search_start); - space_info = __find_space_info(root->fs_info, data); - +search: down_read(&space_info->groups_sem); - while (1) { + list_for_each_entry(block_group, &space_info->block_groups, list) { struct btrfs_free_space *free_space; - /* - * the only way this happens if our hint points to a block - * group thats not of the proper type, while looping this - * should never happen - */ - if (empty_size) - extra_loop = 1; - if (!block_group) - goto new_group_no_lock; + atomic_inc(&block_group->count); + search_start = block_group->key.objectid; +have_block_group: if (unlikely(!block_group->cached)) { mutex_lock(&block_group->cache_mutex); ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); - if (ret) + if (ret) { + put_block_group(block_group); break; + } } mutex_lock(&block_group->alloc_mutex); - if (unlikely(!block_group_bits(block_group, data))) - goto new_group; if (unlikely(block_group->ro)) - goto new_group; + goto loop; free_space = btrfs_find_free_space(block_group, search_start, total_needed); - if (free_space) { - u64 start = block_group->key.objectid; - 
u64 end = block_group->key.objectid + - block_group->key.offset; + if (!free_space) + goto loop; - search_start = stripe_align(root, free_space->offset); + search_start = stripe_align(root, free_space->offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) - goto new_group; + /* move on to the next group */ + if (search_start + num_bytes >= search_end) + goto loop; - /* move on to the next group */ - if (search_start + num_bytes > end) - goto new_group; + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) + goto loop; - if (last_wanted && search_start != last_wanted) { - total_needed += empty_cluster; - empty_size += empty_cluster; - last_wanted = 0; - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - continue; - } + if (using_hint && search_start > hint_byte) + goto loop; - /* else we go to the next block group */ - goto new_group; - } + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - last_wanted = 0; - continue; - } - - /* else we go to the next block group */ - goto new_group; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) { + mutex_unlock(&block_group->alloc_mutex); + goto have_block_group; } + goto loop; + } - ins->objectid = search_start; - ins->offset = num_bytes; + ins->objectid = search_start; + ins->offset = num_bytes; - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); - /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); - break; - } -new_group: + btrfs_remove_free_space_lock(block_group, search_start, + num_bytes); + /* we are all good, lets return */ + mutex_unlock(&block_group->alloc_mutex); + break; +loop: mutex_unlock(&block_group->alloc_mutex); put_block_group(block_group); - block_group = NULL; -new_group_no_lock: - /* don't try to compare new allocations against the - * last allocation any more - */ - last_wanted = 0; - - /* - * Here's how this works. - * loop == 0: we were searching a block group via a hint - * and didn't find anything, so we start at - * the head of the block groups and keep searching - * loop == 1: we're searching through all of the block groups - * if we hit the head again we have searched - * all of the block groups for this space and we - * need to try and allocate, if we cant error out. - * loop == 2: we allocated more space and are looping through - * all of the block groups again. - */ - if (loop == 0) { - head = &space_info->block_groups; - cur = head->next; - loop++; - } else if (loop == 1 && cur == head) { - int keep_going; - - /* at this point we give up on the empty_size - * allocations and just try to allocate the min - * space. 
- * - * The extra_loop field was set if an empty_size - * allocation was attempted above, and if this - * is try we need to try the loop again without - * the additional empty_size. - */ - total_needed -= empty_size; - empty_size = 0; - keep_going = extra_loop; - loop++; - - if (allowed_chunk_alloc && !chunk_alloc_done) { - up_read(&space_info->groups_sem); - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - down_read(&space_info->groups_sem); - if (ret < 0) - goto loop_check; - head = &space_info->block_groups; - /* - * we've allocated a new chunk, keep - * trying - */ - keep_going = 1; - chunk_alloc_done = 1; - } else if (!allowed_chunk_alloc) { - space_info->force_alloc = 1; - } -loop_check: - if (keep_going) { - cur = head->next; - extra_loop = 0; - } else { - break; - } - } else if (cur == head) { - break; + if (using_hint) { + empty_size += empty_cluster; + total_needed += empty_cluster; + using_hint = 0; + up_read(&space_info->groups_sem); + goto search; } + } + up_read(&space_info->groups_sem); - block_group = list_entry(cur, struct btrfs_block_group_cache, - list); - atomic_inc(&block_group->count); + if (!ins->objectid && (empty_size || allowed_chunk_alloc)) { + int try_again = empty_size; - search_start = block_group->key.objectid; - cur = cur->next; + total_needed -= empty_size; + empty_size = 0; + + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + if (!ret) + try_again = 1; + allowed_chunk_alloc = 0; + } else { + space_info->force_alloc = 1; + } + + if (try_again) + goto search; + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; } /* we found what we needed */ @@ -2785,19 +2720,10 @@ loop_check: if (last_ptr) *last_ptr = ins->objectid + ins->offset; + put_block_group(block_group); ret = 0; - } else if (!ret) { - printk(KERN_ERR "btrfs searching for %llu bytes, " - "num_bytes %llu, loop %d, allowed_alloc %d\n", - (unsigned long long)total_needed, - (unsigned long long)num_bytes, - loop, allowed_chunk_alloc); - ret = -ENOSPC; } - if (block_group) - put_block_group(block_group); - up_read(&space_info->groups_sem); return ret; } -- cgit v1.2.3 From 6226cb0a5ea3f6289883753c15d53f48a6c6bbfb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 Apr 2009 10:14:18 -0400 Subject: Btrfs: kill the block group alloc mutex This patch removes the block group alloc mutex used to protect the free space tree for allocations and replaces it with a spin lock which is used only to protect the free space rb tree. This means we only take the lock when we are directly manipulating the tree, which makes us a touch faster with multi-threaded workloads. This patch also gets rid of btrfs_find_free_space and replaces it with btrfs_find_space_for_alloc, which takes the number of bytes you want to allocate, and empty_size, which is used to indicate how much free space should be at the end of the allocation. It will return an offset for the allocator to use. If we don't end up using it we _must_ call btrfs_add_free_space to put it back. This is the tradeoff to kill the alloc_mutex, since we need to make sure nobody else comes along and takes our space. 
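The caller-side pattern this contract implies, distilled from the find_free_extent() hunks further on, looks roughly like the sketch below (illustrative only, not part of the patch; stripe_align() and the single bounds test stand in for the full set of checks the real allocator performs):

/*
 * Sketch of the new allocation contract: btrfs_find_space_for_alloc()
 * removes the range from the free space tree under the new tree_lock
 * spinlock, so the caller owns it the moment it is returned.  Anything
 * the caller does not end up using has to be handed back with
 * btrfs_add_free_space(), since no mutex stops another allocator from
 * grabbing that space once it is back in the tree.
 */
static u64 alloc_sketch(struct btrfs_root *root,
			struct btrfs_block_group_cache *block_group,
			u64 search_start, u64 num_bytes, u64 empty_size,
			u64 search_end)
{
	u64 offset;

	offset = btrfs_find_space_for_alloc(block_group, search_start,
					    num_bytes, empty_size);
	if (!offset)
		return 0;	/* nothing big enough in this block group */

	/* decide where the allocation really starts */
	search_start = stripe_align(root, offset);

	if (search_start + num_bytes >= search_end) {
		/* can't use it after all: hand the whole range back */
		btrfs_add_free_space(block_group, offset, num_bytes);
		return 0;
	}

	/* keep the aligned part, return any leading slack we skipped over */
	if (offset < search_start)
		btrfs_add_free_space(block_group, offset,
				     search_start - offset);
	return search_start;
}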
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 +-- fs/btrfs/extent-tree.c | 46 +++++------ fs/btrfs/free-space-cache.c | 183 ++++++++++++++------------------------------ 3 files changed, 83 insertions(+), 157 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f48905ee524..527744561f9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -644,7 +644,6 @@ struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; spinlock_t lock; - struct mutex alloc_mutex; struct mutex cache_mutex; u64 pinned; u64 reserved; @@ -656,6 +655,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ + spinlock_t tree_lock; struct rb_root free_space_bytes; struct rb_root free_space_offset; @@ -2177,17 +2177,12 @@ int btrfs_acl_chmod(struct inode *inode); /* free-space-cache.c */ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group); -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index daff751ea6e..6880a271975 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2554,7 +2554,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 total_needed = num_bytes; u64 *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; @@ -2597,7 +2596,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, block_group = btrfs_lookup_block_group(root->fs_info, search_start); if (block_group && block_group_bits(block_group, data)) { - total_needed += empty_size; down_read(&space_info->groups_sem); goto have_block_group; } else if (block_group) { @@ -2611,7 +2609,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { - struct btrfs_free_space *free_space; + u64 offset; atomic_inc(&block_group->count); search_start = block_group->key.objectid; @@ -2627,62 +2625,65 @@ have_block_group: } } - mutex_lock(&block_group->alloc_mutex); - if (unlikely(block_group->ro)) goto loop; - free_space = btrfs_find_free_space(block_group, search_start, - total_needed); - if (!free_space) + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + if (!offset) goto loop; - search_start = stripe_align(root, free_space->offset); + search_start = stripe_align(root, offset); /* move on to the next group */ - if (search_start + num_bytes >= search_end) + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; + } /* move on to the next 
group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; + } - if (using_hint && search_start > hint_byte) + if (using_hint && search_start > hint_byte) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; + } if (exclude_nr > 0 && (search_start + num_bytes > exclude_start && search_start < exclude_start + exclude_nr)) { search_start = exclude_start + exclude_nr; + btrfs_add_free_space(block_group, offset, num_bytes); /* * if search_start is still in this block group * then we just re-search this block group */ if (search_start >= block_group->key.objectid && search_start < (block_group->key.objectid + - block_group->key.offset)) { - mutex_unlock(&block_group->alloc_mutex); + block_group->key.offset)) goto have_block_group; - } goto loop; } ins->objectid = search_start; ins->offset = num_bytes; - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); break; loop: - mutex_unlock(&block_group->alloc_mutex); put_block_group(block_group); if (using_hint) { empty_size += empty_cluster; - total_needed += empty_cluster; using_hint = 0; up_read(&space_info->groups_sem); goto search; @@ -2693,7 +2694,6 @@ loop: if (!ins->objectid && (empty_size || allowed_chunk_alloc)) { int try_again = empty_size; - total_needed -= empty_size; empty_size = 0; if (allowed_chunk_alloc) { @@ -5782,7 +5782,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); read_extent_buffer(leaf, &cache->item, @@ -5838,7 +5838,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 69b023ff6f7..df19b60eef6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -182,6 +182,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int ret = 0; + BUG_ON(!info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, &info->offset_index); if (ret) @@ -195,14 +196,23 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, return ret; } -static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; int ret = 0; + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to @@ -215,42 +225,23 @@ static int __btrfs_add_free_space(struct 
btrfs_block_group_cache *block_group, if (right_info) { unlink_free_space(block_group, right_info); - info = right_info; - info->offset = offset; - info->bytes += bytes; + info->bytes += right_info->bytes; + kfree(right_info); } if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - - if (info) { - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kfree(left_info); - } else { - info = left_info; - info->bytes += bytes; - } - } - - if (info) { - ret = link_free_space(block_group, info); - if (ret) - kfree(info); - goto out; + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); } - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(block_group, info); if (ret) kfree(info); -out: + + spin_unlock(&block_group->tree_lock); + if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); if (ret == -EEXIST) @@ -260,17 +251,16 @@ out: return ret; } -static int -__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *info; int ret = 0; - BUG_ON(!block_group->cached); + spin_lock(&block_group->tree_lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - if (info && info->offset == offset) { if (info->bytes < bytes) { printk(KERN_ERR "Found free space at %llu, size %llu," @@ -280,12 +270,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; + spin_unlock(&block_group->tree_lock); goto out; } unlink_free_space(block_group, info); if (info->bytes == bytes) { kfree(info); + spin_unlock(&block_group->tree_lock); goto out; } @@ -293,6 +285,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->bytes -= bytes; ret = link_free_space(block_group, info); + spin_unlock(&block_group->tree_lock); BUG_ON(ret); } else if (info && info->offset < offset && info->offset + info->bytes >= offset + bytes) { @@ -318,14 +311,15 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, */ kfree(info); } - + spin_unlock(&block_group->tree_lock); /* step two, insert a new info struct to cover anything * before the hole */ - ret = __btrfs_add_free_space(block_group, old_start, - offset - old_start); + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); BUG_ON(ret); } else { + spin_unlock(&block_group->tree_lock); if (!info) { printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); @@ -344,50 +338,6 @@ out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_add_free_space(block_group, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_add_free_space(block_group, offset, bytes); - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret = 0; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_remove_free_space(block_group, offset, bytes); - 
mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_remove_free_space(block_group, offset, bytes); - - return ret; -} - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { @@ -426,63 +376,44 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_free_space *info; struct rb_node *node; - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); kfree(info); if (need_resched()) { - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); cond_resched(); - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); } } - mutex_unlock(&block_group->alloc_mutex); -} - -#if 0 -static struct btrfs_free_space *btrfs_find_free_space_offset(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) -{ - struct btrfs_free_space *ret; - - mutex_lock(&block_group->alloc_mutex); - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - mutex_unlock(&block_group->alloc_mutex); - - return ret; + spin_unlock(&block_group->tree_lock); } -static struct btrfs_free_space *btrfs_find_free_space_bytes(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret; - - mutex_lock(&block_group->alloc_mutex); - - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} -#endif - -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) -{ - struct btrfs_free_space *ret = NULL; + struct btrfs_free_space *entry = NULL; + u64 ret = 0; - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 1); - if (!ret) - ret = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes); + spin_lock(&block_group->tree_lock); + entry = tree_search_offset(&block_group->free_space_offset, offset, + bytes + empty_size, 1); + if (!entry) + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes + empty_size); + if (entry) { + unlink_free_space(block_group, entry); + ret = entry->offset; + entry->offset += bytes; + entry->bytes -= bytes; + + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + spin_unlock(&block_group->tree_lock); return ret; } -- cgit v1.2.3 From 04018de5d41e6490840de9399e029fd30e78576f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 Apr 2009 10:14:18 -0400 Subject: Btrfs: kill the pinned_mutex This patch removes the pinned_mutex. The extent io map has an internal tree lock that protects the tree itself, and since we only copy the extent io map when we are committing the transaction we don't need it there. We also don't need it when caching the block group since searching through the tree is also protected by the internal map spin lock. 
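In caller terms the change looks roughly like this (a simplified before/after sketch based on the hunks below, not a verbatim quote of either tree):

	/* before this patch: every pin had to be wrapped in the global
	 * mutex, and btrfs_update_pinned_extents() dropped it on the
	 * way out */
	mutex_lock(&root->fs_info->pinned_mutex);
	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);

	/* after this patch: pinned_extents is an extent_io_tree whose
	 * set_extent_dirty()/clear_extent_dirty() helpers take the
	 * tree's own lock internally, so callers simply make the call */
	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);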
Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/extent-tree.c | 11 ----------- fs/btrfs/tree-log.c | 4 +--- 4 files changed, 1 insertion(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 527744561f9..aaa049b8e13 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -728,7 +728,6 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1747dfd1865..ea59ebfa505 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1647,7 +1647,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6880a271975..15d62a9214b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -166,7 +166,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, u64 extent_start, extent_end, size; int ret; - mutex_lock(&info->pinned_mutex); while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, @@ -192,7 +191,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - mutex_unlock(&info->pinned_mutex); return 0; } @@ -2047,7 +2045,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); if (pin) { set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); @@ -2055,7 +2052,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2127,7 +2123,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(pinned_extents, last, &start, &end, EXTENT_DIRTY); @@ -2136,7 +2131,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } - mutex_unlock(&root->fs_info->pinned_mutex); return 0; } @@ -2149,7 +2143,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int ret; while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2163,7 +2156,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_unlock(&root->fs_info->pinned_mutex); return ret; } @@ -2205,7 +2197,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, free_extent_buffer(buf); pinit: btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); @@ -2511,8 +2502,6 @@ int 
btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc9b87a7975..2871609641f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) -- cgit v1.2.3 From c8c42864f6193638eed136e0243f426a0b7f4bc2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Apr 2009 10:14:18 -0400 Subject: Btrfs: break up btrfs_search_slot into smaller pieces btrfs_search_slot was doing too many things at once. This breaks it up into more reasonable units. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 221 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 90 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dbb72412463..271b05e507d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static noinline void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1446,6 +1446,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) } } +/* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + return -EAGAIN; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. 
+ * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow) { struct extent_buffer *b; - struct extent_buffer *tmp; int slot; int ret; int level; - int should_reada = p->reada; int lowest_unlock = 1; - int blocksize; u8 lowest_level = 0; - u64 blocknr; - u64 gen; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1502,7 +1608,11 @@ again: if (cow) { int wret; - /* is a cow on this block not required */ + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -1557,51 +1667,15 @@ cow_done: if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if ((p->search_for_split || ins_len > 0) && - btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - } else if (ins_len < 0 && - btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + ret = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (ret == -EAGAIN) + goto again; + else if (ret) + goto done; + b = p->nodes[level]; + slot = p->slots[level]; - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - btrfs_release_path(NULL, p); - goto again; - } - slot = p->slots[level]; - BUG_ON(btrfs_header_nritems(b) == 1); - } unlock_up(p, level, lowest_unlock); /* this is only true while dropping a snapshot */ @@ -1610,44 +1684,11 @@ cow_done: goto done; } - blocknr = 
btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + ret = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (ret == -EAGAIN) + goto again; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - b = tmp; - } else { - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. - */ - if (level > 0) { - btrfs_release_path(NULL, p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - - tmp = read_tree_block(root, blocknr, - blocksize, gen); - if (tmp) - free_extent_buffer(tmp); - goto again; - } else { - btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - b = read_node_slot(root, b, slot); - } - } if (!p->skip_locking) { int lret; -- cgit v1.2.3 From 8e73f275011b3264a87339fd9f1690e944e381c9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Apr 2009 10:14:18 -0400 Subject: Btrfs: Optimize locking in btrfs_next_leaf() btrfs_next_leaf was using blocking locks when it could have been using faster spinning ones instead. This adds a few extra checks around the pieces that block and switches over to spinning locks. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 88 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 271b05e507d..b8082762ca7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4127,28 +4127,44 @@ next: int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { int slot; - int level = 1; + int level; struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct extent_buffer *next; struct btrfs_key key; u32 nritems; int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; btrfs_release_path(root, path); + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) return ret; - btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. 
A balance @@ -4158,19 +4174,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (nritems > 0 && path->slots[0] < nritems - 1) { path->slots[0]++; + ret = 0; goto done; } while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + if (!path->nodes[level]) { + ret = 1; + goto done; + } slot = path->slots[level] + 1; c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) - return 1; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } continue; } @@ -4179,16 +4200,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } - /* the path was set to blocking above */ - if (level == 1 && (path->locks[1] || path->skip_locking) && - path->reada) - reada_for_search(root, path, level, slot, 0); + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; - next = read_node_slot(root, c, slot); if (!path->skip_locking) { - btrfs_assert_tree_locked(c); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } break; } @@ -4198,27 +4225,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (path->locks[level]) btrfs_tree_unlock(c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) path->locks[level] = 1; + if (!level) break; - btrfs_set_path_blocking(path); - if (level == 1 && path->locks[1] && path->reada) - reada_for_search(root, path, level, slot, 0); - next = read_node_slot(root, next, 0); + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } } + ret = 0; done: unlock_up(path, 0, 1); - return 0; + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; } /* -- cgit v1.2.3 From fa9c0d795f7b57c76560b7fac703f5d341210e28 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Apr 2009 09:47:43 -0400 Subject: Btrfs: rework allocation clustering Because btrfs is copy-on-write, we end up picking new locations for blocks very often. This makes it fairly difficult to maintain perfect read patterns over time, but we can at least do some optimizations for writes. This is done today by remembering the last place we allocated and trying to find a free space hole big enough to hold more than just one allocation. The end result is that we tend to write sequentially to the drive. This happens all the time for metadata and it happens for data when mounted -o ssd. But, the way we record it is fairly racey and it tends to fragment the free space over time because we are trying to allocate fairly large areas at once. This commit gets rid of the races by adding a free space cluster object with dedicated locking to make sure that only one process at a time is out replacing the cluster. 
The free space fragmentation is somewhat solved by allowing a cluster to be comprised of smaller free space extents. This part definitely adds some CPU time to the cluster allocations, but it allows the allocator to consume the small holes left behind by cow. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 54 +++++--- fs/btrfs/disk-io.c | 5 + fs/btrfs/extent-tree.c | 181 +++++++++++++++++++-------- fs/btrfs/free-space-cache.c | 297 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 44 +++++++ fs/btrfs/transaction.c | 2 - 6 files changed, 509 insertions(+), 74 deletions(-) create mode 100644 fs/btrfs/free-space-cache.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aaa049b8e13..b82931f97ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -633,11 +633,29 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; }; -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; }; struct btrfs_block_group_cache { @@ -667,6 +685,11 @@ struct btrfs_block_group_cache { /* usage count */ atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. 
+ * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; }; struct btrfs_leaf_ref_tree { @@ -838,8 +861,12 @@ struct btrfs_fs_info { spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; - u64 last_alloc; - u64 last_data_alloc; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -1747,6 +1774,7 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); @@ -2173,16 +2201,4 @@ int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -/* free-space-cache.c */ -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ea59ebfa505..e68ef7b456b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -38,6 +38,7 @@ #include "locking.h" #include "ref-cache.h" #include "tree-log.h" +#include "free-space-cache.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1652,6 +1653,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 15d62a9214b..178df4c67de 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" +#include "free-space-cache.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -324,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -static inline void put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { if (atomic_dec_and_test(&cache->count)) kfree(cache); @@ -397,12 +398,12 @@ again: div_factor(cache->key.offset, factor)) { group_start = cache->key.objectid; spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); goto found; } } spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); cond_resched(); } if (!wrapped) { @@ -1592,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) if (!block_group || 
block_group->ro) readonly = 1; if (block_group) - put_block_group(block_group); + btrfs_put_block_group(block_group); return readonly; } @@ -2016,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, WARN_ON(ret); } } - put_block_group(cache); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; } @@ -2033,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return 0; bytenr = cache->key.objectid; - put_block_group(cache); + btrfs_put_block_group(cache); return bytenr; } @@ -2077,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, if (cache->cached) btrfs_add_free_space(cache, bytenr, len); } - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2108,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root, } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2543,12 +2544,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 *last_ptr = NULL; + struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - int using_hint = 0; struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -2561,38 +2563,39 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, allowed_chunk_alloc = 1; if (data & BTRFS_BLOCK_GROUP_METADATA) { - last_ptr = &root->fs_info->last_alloc; + last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) - last_ptr = &root->fs_info->last_data_alloc; + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } if (last_ptr) { - if (*last_ptr) - hint_byte = *last_ptr; - else - empty_size += empty_cluster; - } else { - empty_cluster = 0; + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); } + search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); + if (!last_ptr) { + empty_cluster = 0; + loop = 1; + } + if (search_start == hint_byte) { - using_hint = 1; block_group = btrfs_lookup_block_group(root->fs_info, search_start); if (block_group && block_group_bits(block_group, data)) { down_read(&space_info->groups_sem); goto have_block_group; } else if (block_group) { - put_block_group(block_group); + btrfs_put_block_group(block_group); } - - empty_size += empty_cluster; - using_hint = 0; } search: @@ -2609,7 +2612,7 @@ have_block_group: ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); if (ret) { - put_block_group(block_group); + btrfs_put_block_group(block_group); break; } } @@ -2617,11 +2620,88 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; + if (last_ptr) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); 
+ goto checks; + } + + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + atomic_inc(&block_group->count); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + goto have_block_group; + } + spin_unlock(&last_ptr->lock); + + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + last_ptr_loop = 0; + + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { + /* + * now pull our allocation out of this + * cluster + */ + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } + } + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + if (loop < 2) { + btrfs_return_cluster_to_free_space(NULL, + last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; + } + spin_unlock(&last_ptr->refill_lock); + } + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); if (!offset) goto loop; - +checks: search_start = stripe_align(root, offset); /* move on to the next group */ @@ -2637,11 +2717,6 @@ have_block_group: goto loop; } - if (using_hint && search_start > hint_byte) { - btrfs_add_free_space(block_group, offset, num_bytes); - goto loop; - } - if (exclude_nr > 0 && (search_start + num_bytes > exclude_start && search_start < exclude_start + exclude_nr)) { @@ -2670,33 +2745,33 @@ have_block_group: /* we are all good, lets return */ break; loop: - put_block_group(block_group); - if (using_hint) { - empty_size += empty_cluster; - using_hint = 0; - up_read(&space_info->groups_sem); - goto search; - } + btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); - if (!ins->objectid && (empty_size || allowed_chunk_alloc)) { - int try_again = empty_size; - - empty_size = 0; + /* loop == 0, try to find a clustered alloc in every block group + * loop == 1, try again after forcing a chunk allocation + * loop == 2, set empty_size and empty_cluster to 0 and try again + */ + if (!ins->objectid && loop < 3 && + (empty_size || empty_cluster || allowed_chunk_alloc)) { + if (loop >= 2) { + empty_size = 0; + empty_cluster = 0; + } if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, 1); - if (!ret) - try_again = 1; allowed_chunk_alloc = 0; } else { space_info->force_alloc = 1; } - if (try_again) + if (loop < 3) { + loop++; goto search; + } ret = -ENOSPC; } else if (!ins->objectid) { ret = -ENOSPC; @@ -2707,9 +2782,7 @@ loop: if (!(data & BTRFS_BLOCK_GROUP_DATA)) trans->block_group = block_group->key.objectid; - if (last_ptr) - *last_ptr = ins->objectid + ins->offset; - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; } @@ -2817,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, 
len); btrfs_add_free_space(cache, start, len); - put_block_group(cache); + btrfs_put_block_group(cache); update_reserved_extents(root, start, len, 0); return ret; @@ -2955,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); BUG_ON(ret); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, ref_generation, owner, ins, 1); return ret; @@ -5644,7 +5717,7 @@ next: WARN_ON(block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&block_group->item) > 0); spin_unlock(&block_group->lock); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; out: btrfs_free_path(path); @@ -5774,6 +5847,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -5830,6 +5904,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); @@ -5889,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); block_group->space_info->full = 0; - put_block_group(block_group); - put_block_group(block_group); + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index df19b60eef6..3fdadd28e93 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,15 @@ #include #include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; static int tree_insert_offset(struct rb_root *root, u64 offset, struct rb_node *node) @@ -371,12 +380,58 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) return ret; } +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. 
In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->window_start = 0; + node = rb_first(&cluster->root); + while(node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + link_free_space(block_group, entry); + } + list_del_init(&cluster->block_group_list); + + btrfs_put_block_group(cluster->block_group); + cluster->block_group = NULL; + cluster->root.rb_node = NULL; +out: + spin_unlock(&cluster->lock); + return 0; +} + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space *info; struct rb_node *node; + struct btrfs_free_cluster *cluster; + struct btrfs_free_cluster *safe; spin_lock(&block_group->tree_lock); + + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, + block_group_list) { + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + } + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); @@ -417,3 +472,245 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, return ret; } + +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. 
+ */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + int ret; + + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); + + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + + /* finally drop our ref */ + btrfs_put_block_group(block_group); + return ret; +} + +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; + + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + int total_retries = 0; + int ret; + + /* for metadata, allow allocates with more holes */ + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. 
+ */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + min_bytes = min(min_bytes, bytes + empty_size); + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, min_bytes); + if (!entry) { + ret = -ENOSPC; + goto out; + } + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while(1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; + + node = rb_next(&last->offset_index); + if (!node) { + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we haven't filled the empty size and the window is + * very large. reset and try again + */ + if (next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = 0; + total_retries++; + if (total_retries % 256 == 0) { + if (min_bytes >= (bytes + empty_size)) { + ret = -ENOSPC; + goto out; + } + /* + * grow our allocation a bit, we're not having + * much luck + */ + min_bytes *= 2; + goto again; + } + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while(1) { + node = rb_next(&entry->offset_index); + unlink_free_space(block_group, entry); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + ret = 0; + cluster->max_size = max_extent; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 00000000000..ab0bdc0a63c --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 664782c6a2d..3e8225de4e9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root) GFP_NOFS); BUG_ON(!cur_trans); root->fs_info->generation++; - root->fs_info->last_alloc = 0; - root->fs_info->last_data_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; -- cgit v1.2.3 From a9572a15a854bd40453a6b7b4dd9d29132f64c64 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: introduce btrfs_show_options btrfs options can change at times other than mount, yet /proc/mounts shows the options string used when the fs was mounted (an example would be when btrfs determines that barriers aren't useful and turns them off.) This patch instead outputs the actual options in use by btrfs. 
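The practical difference is easiest to see from userspace: with generic_show_options the kernel replays whatever option string was passed at mount time, while a dedicated .show_options callback reports the flags the filesystem is actually using right now. A rough standalone sketch of reading that live option string through the standard mntent interface follows; it is illustrative only and not part of the patch:

    /* print the live mount options for every btrfs mount, as reported
     * through /proc/mounts by the filesystem's .show_options callback */
    #include <stdio.h>
    #include <string.h>
    #include <mntent.h>

    int main(void)
    {
            FILE *m = setmntent("/proc/mounts", "r");
            struct mntent *e;

            if (!m) {
                    perror("setmntent");
                    return 1;
            }
            while ((e = getmntent(m)) != NULL) {
                    if (strcmp(e->mnt_type, "btrfs") == 0)
                            printf("%s: %s\n", e->mnt_dir, e->mnt_opts);
            }
            endmntent(m);
            return 0;
    }
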
Signed-off-by: Eric Paris Signed-off-by: Chris Mason --- fs/btrfs/super.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 19a4daf03cc..2ceb40b794a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -385,6 +386,37 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; +} + static void btrfs_write_super(struct super_block *sb) { sb->s_dirt = 0; @@ -630,7 +662,7 @@ static struct super_operations btrfs_super_ops = { .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, - .show_options = generic_show_options, + .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, -- cgit v1.2.3 From 3a5e14048a0a81276d284cbda441507a17e26147 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 2 Apr 2009 16:49:40 -0400 Subject: Btrfs: notreelog mount option Add a 'notreelog' mount option to disable the tree log (used by fsync, O_SYNC writes). This is much slower, but the tree logging produces inconsistent views into the FS for ceph. 
Signed-off-by: Sage Weil Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/super.c | 10 +++++++++- fs/btrfs/tree-log.c | 5 +++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b82931f97ef..1e99a994863 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1036,6 +1036,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2ceb40b794a..3baa2c109e5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -67,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_err, }; static match_table_t tokens = { @@ -84,6 +85,7 @@ static match_table_t tokens = { {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, {Opt_err, NULL}, }; @@ -223,6 +225,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; default: break; } @@ -412,6 +418,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",compress"); if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",notreelog"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2871609641f..c047eea4093 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2898,6 +2898,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; -- cgit v1.2.3 From dccae99995089641fbac452ebc7f0cab18751ddb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 2 Apr 2009 16:59:01 -0400 Subject: Btrfs: add flushoncommit mount option The 'flushoncommit' mount option forces any data dirtied by a write in a prior transaction to commit as part of the current commit. This makes the committed state a fully consistent view of the file system from the application's perspective (i.e., it includes all completed file system operations). This was previously the behavior only when a snapshot is created. This is used by Ceph to ensure that completed writes make it to the platter along with the metadata operations they are bound to (by BTRFS_IOC_TRANS_{START,END}). 
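For readers unfamiliar with the Ceph usage mentioned above: the application brackets a group of writes with the btrfs transaction ioctls so they become persistent together at the next commit, and flushoncommit makes the commit carry the dirtied data pages as well. A hedged userspace sketch is below; it is not part of the patch, the ioctl numbers are copied here for illustration and should really come from fs/btrfs/ioctl.h, the file path is made up, and starting a transaction from userspace needs appropriate privileges:

    /* bracket a write in a btrfs transaction */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ioctl.h>

    #define BTRFS_IOCTL_MAGIC      0x94
    #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
    #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)

    int main(void)
    {
            int fd = open("/mnt/btrfs/journal", O_WRONLY | O_CREAT, 0644);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, BTRFS_IOC_TRANS_START) < 0)
                    perror("BTRFS_IOC_TRANS_START");
            if (write(fd, "record\n", 7) < 0)
                    perror("write");
            if (ioctl(fd, BTRFS_IOC_TRANS_END) < 0)
                    perror("BTRFS_IOC_TRANS_END");
            close(fd);
            return 0;
    }
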
Signed-off-by: Sage Weil Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/super.c | 14 ++++++++++---- fs/btrfs/transaction.c | 5 ++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e99a994863..bb6ac5b8765 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1037,6 +1037,7 @@ struct btrfs_root { #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) #define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3baa2c109e5..9744af9d71e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -68,7 +68,7 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_err, + Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -86,6 +86,7 @@ static match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, {Opt_err, NULL}, }; @@ -229,6 +230,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: disabling tree log\n"); btrfs_set_opt(info->mount_opt, NOTREELOG); break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; default: break; } @@ -370,9 +375,8 @@ fail_close: int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_sb(sb); int ret; - root = btrfs_sb(sb); if (sb->s_flags & MS_RDONLY) return 0; @@ -419,7 +423,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) - seq_puts(seq, ",notreelog"); + seq_puts(seq, ",no-treelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flush-on-commit"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3e8225de4e9..2869b3361eb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -972,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); btrfs_run_ordered_operations(root, 0); @@ -1051,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (snap_pending) { + if (flush_on_commit || snap_pending) { + if (flush_on_commit) + btrfs_start_delalloc_inodes(root); ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } -- cgit v1.2.3 From b5555f77115bfba3630fefff2d4e68140b644171 Mon Sep 17 00:00:00 2001 From: Amit Gud Date: Thu, 2 Apr 2009 17:01:27 -0400 Subject: Btrfs: fix race in worker_loop Need to check kthread_should_stop after schedule_timeout() before calling schedule(). This causes threads to sleep with potentially no one to wake them up causing mount(2) to hang in btrfs_stop_workers waiting for threads to stop. 
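The race is the classic lost-wakeup pattern for kernel threads: kthread_stop() sets the stop flag and wakes the thread once, so if the worker tests kthread_should_stop(), then sets its task state and sleeps without re-checking, a stop request that lands in between leaves it asleep with nobody left to wake it. A condensed, illustrative loop showing the safe ordering follows; have_work() and do_work() are placeholder stand-ins and this is not the btrfs worker code:

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static bool have_work(void) { return false; }   /* placeholder */
    static void do_work(void) { }                   /* placeholder */

    static int worker_thread(void *arg)
    {
            while (!kthread_should_stop()) {
                    if (have_work()) {
                            do_work();
                            continue;
                    }
                    set_current_state(TASK_INTERRUPTIBLE);
                    /*
                     * kthread_stop() may have been called after the loop
                     * condition was tested; re-check before sleeping so we
                     * never schedule() away with no one left to wake us.
                     */
                    if (kthread_should_stop()) {
                            __set_current_state(TASK_RUNNING);
                            break;
                    }
                    schedule();
                    __set_current_state(TASK_RUNNING);
            }
            return 0;
    }
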
Signed-off-by: Amit Gud Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c84ca1f5259..cba701dba35 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -195,6 +195,9 @@ again_locked: if (!list_empty(&worker->pending)) continue; + if (kthread_should_stop()) + break; + /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); @@ -208,7 +211,8 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); } __set_current_state(TASK_RUNNING); } -- cgit v1.2.3 From 09771430f3b46ee27c69daa7ecad82007568e834 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: free inode struct when btrfs_new_inode fails btrfs_new_inode doesn't call iput to free the inode when it fails. Signed-off-by: Shen Feng Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1cff528d5b5..0ecb3fa75dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3565,6 +3567,7 @@ fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } -- cgit v1.2.3 From 93dbfad7ac647d4f00354893fe93e8a55be114d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 3 Apr 2009 10:33:45 -0400 Subject: Btrfs: fix __ucmpdi2 compile bug on 32 bit builds We get this on 32 builds: fs/built-in.o: In function `extent_fiemap': (.text+0x1019f2): undefined reference to `__ucmpdi2' Happens because of a switch statement with a 64 bit argument. Convert this to an if statement to fix this. 
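Some background on the link error: on 32-bit targets gcc can lower a switch whose controlling expression is 64 bits wide into a call to the libgcc comparison helper __ucmpdi2, and the kernel does not link against libgcc, so the reference stays undefined. An if/else chain compiles to ordinary inline 64-bit compares. A small standalone sketch of the transformed pattern, using made-up sentinel constants rather than the real EXTENT_MAP_* values:

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative sentinels, not the real EXTENT_MAP_* constants */
    #define MAP_LAST_BYTE ((uint64_t)-4)
    #define MAP_HOLE      ((uint64_t)-3)
    #define MAP_INLINE    ((uint64_t)-2)
    #define MAP_DELALLOC  ((uint64_t)-1)

    static const char *classify(uint64_t block_start)
    {
            /* switch (block_start) may emit a __ucmpdi2 call on 32-bit;
             * the if/else chain keeps the compares inline */
            if (block_start == MAP_LAST_BYTE)
                    return "last byte";
            else if (block_start == MAP_HOLE)
                    return "hole";
            else if (block_start == MAP_INLINE)
                    return "inline";
            else if (block_start == MAP_DELALLOC)
                    return "delalloc";
            return "mapped";
    }

    int main(void)
    {
            printf("%s\n", classify(MAP_HOLE));
            printf("%s\n", classify(4096));
            return 0;
    }
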
Signed-off-by: Heiko Carstens Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 08085af089e..eb2bee8b7fb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, disko = 0; flags = 0; - switch (em->block_start) { - case EXTENT_MAP_LAST_BYTE: + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - break; - case EXTENT_MAP_HOLE: + } else if (em->block_start == EXTENT_MAP_HOLE) { flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case EXTENT_MAP_INLINE: + } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); - break; - case EXTENT_MAP_DELALLOC: + } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - break; - default: + } else { disko = em->block_start; - break; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -- cgit v1.2.3 From 2e966ed22c3c56227f8a7322d7b008945352e6ab Mon Sep 17 00:00:00 2001 From: Jim Owens Date: Thu, 2 Apr 2009 17:02:55 -0400 Subject: Btrfs: remove unused ftrace include Signed-off-by: jim owens Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 - fs/btrfs/delayed-ref.c | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index cba701dba35..51bfdfc8fcd 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "async-thread.h" #define WORK_QUEUED_BIT 0 diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cbf7dc8ae3e..d6c01c096a4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -18,7 +18,6 @@ #include #include -#include #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" -- cgit v1.2.3 From d4a789474a6213d1b55b363fb1787b0abf877bba Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: fix typos in comments Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 20 +++++++++++--------- fs/btrfs/locking.c | 4 ++-- fs/btrfs/volumes.h | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bb6ac5b8765..8f4e152bb11 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_MAX 9 /* - * the key defines the order in the tree, and so it also defines (optimal) - * block layout. objectid corresonds to the inode number. The flags - * tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with flags of 1 might refer to the inode - * data, flags of 2 may point to file data in the btree and flags == 3 - * may point to extents. + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. * * offset is the starting byte offset for this key in the stream. 
* @@ -200,7 +203,7 @@ struct btrfs_dev_item { /* * starting byte of this partition on the device, - * to allowr for stripe alignment in the future + * to allow for stripe alignment in the future */ __le64 start_offset; @@ -958,7 +961,6 @@ struct btrfs_root { }; /* - * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -989,7 +991,7 @@ struct btrfs_root { #define BTRFS_EXTENT_CSUM_KEY 128 /* - * root items point to tree roots. There are typically in the root + * root items point to tree roots. They are typically in the root * tree used by the super block to find all the other trees */ #define BTRFS_ROOT_ITEM_KEY 132 diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a5310c0f41e..1c36e5cd8f5 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) /* * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for every long, and often they don't block - * at all. For a dbench 50 run, if we don't spin one the blocking bit + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit * at all, the context switch rate can jump up to 400,000/sec or more. * * So, we're still stuck with this crummy spin on the blocking bit, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86c44e9ae11..2185de72ff7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -76,7 +76,7 @@ struct btrfs_device { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent coyp of the super */ + /* the device with this id has the most recent copy of the super */ u64 latest_devid; u64 latest_trans; u64 num_devices; -- cgit v1.2.3 From ff0a5836ac48554b9f3b9d3c5f5bce4ddea11f1f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: remove dead code merge is always NULL at this point. 
Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 50da69da20c..b187917b36f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; - free_extent_map(merge); goto out; } atomic_inc(&em->refs); -- cgit v1.2.3 From 3e7ad38d20ad113158d1b4c9de0f51c04f50d4f7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Apr 2009 16:46:06 -0400 Subject: Btrfs: remove dead code Remove an unneeded return statement and conditional Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e68ef7b456b..a850c3ac196 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1413,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio) ret = extent_range_uptodate(io_tree, start + length, start + buf_len - 1); - if (ret == 1) - return ret; return ret; } -- cgit v1.2.3 From c293498be69816087746161338de4b81efdf69fc Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Thu, 2 Apr 2009 17:05:11 -0400 Subject: Btrfs: BUG to BUG_ON changes Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +-- fs/btrfs/free-space-cache.c | 3 +-- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b8082762ca7..e5b2533b691 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2157,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(!path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); - if (slot > nritems) - BUG(); + BUG_ON(slot > nritems); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3fdadd28e93..768b9523662 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -253,8 +253,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); - if (ret == -EEXIST) - BUG(); + BUG_ON(ret == -EEXIST); } return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c047eea4093..25f20ea11f2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1222,8 +1222,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } -- cgit v1.2.3